Commit dbff6d68 authored by: W wangmeng28

Merge remote-tracking branch 'upstream/develop' into chinese_poetry

......@@ -11,6 +11,7 @@ import multiprocessing
import numpy as np
import paddle.v2 as paddle
from threading import local
import atexit
from data_utils.utility import read_manifest
from data_utils.utility import xmap_readers_mp
from data_utils.augmentor.augmentation import AugmentationPipeline
......@@ -59,6 +60,9 @@ class DataGenerator(object):
be passed forward directly without
converting to index sequence.
:type keep_transcription_text: bool
:param num_conv_layers: The number of convolution layers, used to compute
the sequence length.
:type num_conv_layers: int
"""
def __init__(self,
......@@ -74,7 +78,8 @@ class DataGenerator(object):
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count() // 2,
random_seed=0,
keep_transcription_text=False):
keep_transcription_text=False,
num_conv_layers=2):
self._max_duration = max_duration
self._min_duration = min_duration
self._normalizer = FeatureNormalizer(mean_std_filepath)
......@@ -95,6 +100,7 @@ class DataGenerator(object):
self._local_data = local()
self._local_data.tar2info = {}
self._local_data.tar2object = {}
self._num_conv_layers = num_conv_layers
def process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data.
......@@ -213,7 +219,15 @@ class DataGenerator(object):
:return: Data feeding dict.
:rtype: dict
"""
return {"audio_spectrogram": 0, "transcript_text": 1}
feeding_dict = {
"audio_spectrogram": 0,
"transcript_text": 1,
"sequence_offset": 2,
"sequence_length": 3
}
for i in xrange(self._num_conv_layers):
feeding_dict["conv%d_index_range" % i] = len(feeding_dict)
return feeding_dict
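# Illustrative example (not part of the original code): with the default
# num_conv_layers=2 this property returns
#   {"audio_spectrogram": 0, "transcript_text": 1, "sequence_offset": 2,
#    "sequence_length": 3, "conv0_index_range": 4, "conv1_index_range": 5},
# i.e. one extra index-range field is appended per convolution layer.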
@property
def vocab_size(self):
......@@ -274,13 +288,18 @@ class DataGenerator(object):
for instance in manifest:
yield instance
return xmap_readers_mp(
reader, cleanup_callback = xmap_readers_mp(
lambda instance: self.process_utterance(instance["audio_filepath"], instance["text"]),
reader,
self._num_threads,
4096,
order=True)
# register the cleanup callback in the main process
atexit.register(cleanup_callback)
return reader
def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
Padding audio features with zeros to make them have the same shape (or
......@@ -306,7 +325,30 @@ class DataGenerator(object):
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
new_batch.append((padded_audio, text))
# Stride size for conv0 is (3, 2)
# Stride size for conv1 to convN is (1, 2)
# These must match the strides in the network and are hard-coded here
padded_instance = [padded_audio, text]
padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
valid_w = (audio.shape[1] - 1) // 3 + 1
padded_instance += [
[0], # sequence offset, always 0
[valid_w], # valid sequence length
# Index ranges for channel, height and width
# Please refer to the scale_sub_region layer for details
[1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
]
pre_padded_h = padded_conv0_h
for i in xrange(self._num_conv_layers - 1):
padded_h = (pre_padded_h - 1) // 2 + 1
pre_padded_h = padded_h
padded_instance += [
[1, 32, 1, padded_h, valid_w + 1, padded_conv0_w]
]
new_batch.append(padded_instance)
return new_batch
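# Worked example with assumed shapes (for illustration only): for a padded
# spectrogram of shape (161, 300) whose valid (un-padded) width is 210,
#   padded_conv0_h = (161 - 1) // 2 + 1 = 81
#   padded_conv0_w = (300 - 1) // 3 + 1 = 100
#   valid_w        = (210 - 1) // 3 + 1 = 70
# so the conv0 index range is [1, 32, 1, 81, 71, 100]: columns 71..100 (the
# padded tail) of all 32 channels and all 81 rows are reset to zero by the
# scale_sub_region layer.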
def _batch_shuffle(self, manifest, batch_size, clipped=False):
......
......@@ -138,6 +138,10 @@ def xmap_readers_mp(mapper, reader, process_num, buffer_size, order=False):
out_queue.put(sample)
out_queue.put(end_flag)
def cleanup():
# kill all subprocesses and threads
os._exit(0)
def xreader():
# prepare shared memory
manager = Manager()
......@@ -174,4 +178,4 @@ def xmap_readers_mp(mapper, reader, process_num, buffer_size, order=False):
yield sample
sample = flush_queue.get()
return xreader
return xreader, cleanup
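# Sketch of the intended caller-side usage (assumed; mirrors the data.py
# change shown above):
#   reader, cleanup_callback = xmap_readers_mp(mapper, reader, num_threads,
#                                              4096, order=True)
#   atexit.register(cleanup_callback)  # kill worker processes on exit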
......@@ -70,7 +70,6 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# FILES + glob.glob('glog/src/*.cc')
FILES = [
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
......@@ -107,7 +106,6 @@ decoders_module = [
'kenlm',
'openfst-1.6.3/src/include',
'ThreadPool',
#'glog/src'
],
libraries=LIBS,
extra_compile_args=ARGS)
......@@ -115,7 +113,7 @@ decoders_module = [
setup(
name='swig_decoders',
version='0.1',
version='1.0',
description="""CTC decoders""",
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
......@@ -69,7 +69,8 @@ def infer():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=1,
keep_transcription_text=True)
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.infer_manifest,
batch_size=args.num_samples,
......@@ -100,10 +101,11 @@ def infer():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
error_rate_func = cer if args.error_rate_type == 'cer' else wer
target_transcripts = [transcript for _, transcript in infer_data]
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
print("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
......
......@@ -165,7 +165,7 @@ class DeepSpeech2Model(object):
def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n, vocab_list,
language_model_path, num_processes):
language_model_path, num_processes, feeding_dict):
"""Model inference. Infer the transcription for a batch of speech
utterances.
......@@ -195,6 +195,9 @@ class DeepSpeech2Model(object):
:type language_model_path: basestring|None
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:param feeding_dict: Feeding is a map from each data field name to the
tuple index of that field in the data returned by the reader.
:type feeding_dict: dict|list
:return: List of transcription texts.
:rtype: List of basestring
"""
......@@ -203,10 +206,13 @@ class DeepSpeech2Model(object):
self._inferer = paddle.inference.Inference(
output_layer=self._log_probs, parameters=self._parameters)
# run inference
infer_results = self._inferer.infer(input=infer_data)
num_steps = len(infer_results) // len(infer_data)
infer_results = self._inferer.infer(
input=infer_data, feeding=feeding_dict)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
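# Illustrative example (assumed lengths): if the batch holds three utterances
# whose valid sequence lengths (infer_data[i][3][0]) are 5, 3 and 4, then
# start_pos == [0, 5, 8, 12] and probs_split becomes the slices
# infer_results[0:5], [5:8] and [8:12], one variable-length chunk per utterance.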
# run decoder
......@@ -274,9 +280,25 @@ class DeepSpeech2Model(object):
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(num_conv_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))
self._log_probs, self._loss = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=vocab_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
......
......@@ -7,7 +7,7 @@ import paddle.v2 as paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act):
padding, act, index_range_data):
"""Convolution layer with batch normalization.
:param input: Input layer.
......@@ -24,6 +24,8 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
:type padding: int|tuple|list
:param act: Activation type.
:type act: BaseActivation
:param index_range_data: Index range data indicating the sub-region.
:type index_range_data: LayerOutput
:return: Batch norm layer after convolution layer.
:rtype: LayerOutput
"""
......@@ -36,7 +38,11 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
batch_norm = paddle.layer.batch_norm(input=conv_layer, act=act)
# reset padding part to 0
scale_sub_region = paddle.layer.scale_sub_region(
batch_norm, index_range_data, value=0.0)
return scale_sub_region
def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
......@@ -136,13 +142,15 @@ def bidirectional_gru_bn_layer(name, input, size, act):
return paddle.layer.concat(input=[forward_gru, backward_gru])
def conv_group(input, num_stacks):
def conv_group(input, num_stacks, index_range_datas):
"""Convolution group with stacked convolution layers.
:param input: Input layer.
:type input: LayerOutput
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
:param index_range_datas: Index ranges for each convolution layer.
:type index_range_datas: tuple|list
:return: Output layer of the convolution group.
:rtype: LayerOutput
"""
......@@ -153,7 +161,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[0])
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
......@@ -162,7 +171,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[i + 1])
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
......@@ -207,6 +217,9 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
def deep_speech_v2_network(audio_data,
text_data,
seq_offset_data,
seq_len_data,
index_range_datas,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
......@@ -219,6 +232,12 @@ def deep_speech_v2_network(audio_data,
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param seq_offset_data: Sequence offset data layer.
:type seq_offset_data: LayerOutput
:param seq_len_data: Valid sequence length data layer.
:type seq_len_data: LayerOutput
:param index_range_datas: Index range data layers.
:type index_range_datas: tuple|list
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
......@@ -239,7 +258,9 @@ def deep_speech_v2_network(audio_data,
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data, num_stacks=num_conv_layers)
input=audio_data,
num_stacks=num_conv_layers,
index_range_datas=index_range_datas)
# convert data from convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
......@@ -248,9 +269,16 @@ def deep_speech_v2_network(audio_data,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# remove padding part
remove_padding_data = paddle.layer.sub_seq(
input=conv2seq,
offsets=seq_offset_data,
sizes=seq_len_data,
act=paddle.activation.Linear(),
bias_attr=False)
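# Note: with the padding scheme above, seq_offset_data is always [0] and
# seq_len_data is the valid post-conv width (valid_w), so sub_seq keeps only
# the first valid_w time steps and drops the zero-padded tail before the RNNs.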
# rnn group
rnn_group_output = rnn_group(
input=conv2seq,
input=remove_padding_data,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru,
......
......@@ -70,7 +70,8 @@ def evaluate():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True)
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.test_manifest,
batch_size=args.batch_size,
......@@ -103,8 +104,9 @@ def evaluate():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
target_transcripts = [transcript for _, transcript in infer_data]
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
error_sum += error_rate_func(target, result)
num_ins += 1
......
......@@ -88,7 +88,8 @@ def tune():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True)
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
audio_data = paddle.layer.data(
name="audio_spectrogram",
......@@ -96,10 +97,25 @@ def tune():
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(args.num_conv_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))
output_probs, _ = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
......@@ -156,15 +172,17 @@ def tune():
for infer_data in batch_reader():
if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
break
infer_results = inferer.infer(input=infer_data)
num_steps = len(infer_results) // len(infer_data)
infer_results = inferer.infer(input=infer_data,
feeding=data_generator.feeding)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
for i in xrange(len(infer_data))
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
target_transcripts = [transcript for _, transcript in infer_data]
target_transcripts = [data[1] for data in infer_data]
num_ins += len(target_transcripts)
# grid search
......
......@@ -75,13 +75,15 @@ def train():
max_duration=args.max_duration,
min_duration=args.min_duration,
specgram_type=args.specgram_type,
num_threads=args.num_proc_data)
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
dev_generator = DataGenerator(
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config="{}",
specgram_type=args.specgram_type,
num_threads=args.num_proc_data)
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest,
batch_size=args.batch_size,
......
This diff has been collapsed.
......@@ -65,10 +65,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are
### Create a word vector table for the text
```python
def create_embedding(self, input, prefix=''):
'''
Create an embedding table whose name has a `prefix`.
'''
logger.info("create embedding table [%s] which dimention is %d" %
"""
Create word embedding. The `prefix` is added in front of the name of
the embedding's learnable parameter.
"""
logger.info("Create embedding table [%s] whose dimention is %d" %
(prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding(
input=input,
......@@ -82,14 +83,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin
### CNN implementation
```python
def create_cnn(self, emb, prefix=''):
'''
"""
A multi-layer CNN.
:param emb: The word embedding.
:type emb: paddle.layer
:param prefix: The prefix will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
'''
def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size)
conv = paddle.networks.sequence_conv_pool(
......@@ -97,15 +99,13 @@ def create_cnn(self, emb, prefix=''):
context_len=context_len,
hidden_size=hidden_size,
# set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
fc_param_attr=ParamAttr(name=key + '_fc.w'),
fc_bias_attr=ParamAttr(name=key + '_fc.b'),
pool_bias_attr=ParamAttr(name=key + '_pool.b'))
context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv
logger.info('create a sequence_conv_pool which context width is 3')
conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4
```
......@@ -118,9 +118,9 @@ RNN is suitable for learning variable length of the information
```python
def create_rnn(self, emb, prefix=''):
'''
"""
A GRU sentence vector learner.
'''
"""
gru = paddle.networks.simple_gru(
input=emb,
size=self.dnn_dims[1],
......@@ -136,14 +136,15 @@ def create_rnn(self, emb, prefix=''):
```python
def create_fc(self, emb, prefix=''):
'''
"""
A multi-layer fully connected neural networks.
:param emb: The output of the embedding layer
:type emb: paddle.layer
:param prefix: A prefix will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
'''
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc(
......@@ -160,13 +161,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling
```python
def create_dnn(self, sent_vec, prefix):
# if more than three layers exists, a fc layer will be added.
if len(self.dnn_dims) > 1:
_input_layer = sent_vec
for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim)
logger.info("create fc layer [%s] which dimention is %d" %
(name, dim))
fc = paddle.layer.fc(
input=_input_layer,
size=dim,
......@@ -180,117 +178,12 @@ def create_dnn(self, sent_vec, prefix):
### Classification / Regression
The structure of classification and regression is similar. Below function can be used for both tasks.
```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
Please check the function `_build_classification_or_regression_model` in [network_conf.py]( https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for detail implementation.
### Pairwise Rank
Please check the function `_build_rank_model` in [network_conf.py]( https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for implementation.
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not used.
return cost, None, None
```
## Data Format
Below is a simple example for the data in `./data`
......@@ -347,67 +240,7 @@ The example of this format is as follows.
## Training
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification.
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
Parameter description:
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The parameters accepted by the script `train.py` can be listed by executing `python train.py --help`. Some important parameters are:
- `train_data_path` Training data path
- `test_data_path` Test data path, optional
......@@ -418,48 +251,8 @@ Parameter description:
- `dnn_dims` The dimension of each layer of the model is set, the default is `256,128,64,32`,with 4 layers.
## To predict using the trained model
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
Important parameters are
The parameters accepted by the script `infer.py` can be listed by executing `python infer.py --help`. Some important parameters are:
- `data_path` Path for the data to predict
- `prediction_output_path` Prediction output path
......
......@@ -107,10 +107,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are
### Create a word vector table for the text
```python
def create_embedding(self, input, prefix=''):
'''
Create an embedding table whose name has a `prefix`.
'''
logger.info("create embedding table [%s] which dimention is %d" %
"""
Create word embedding. The `prefix` is added in front of the name of
the embedding's learnable parameter.
"""
logger.info("Create embedding table [%s] whose dimention is %d" %
(prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding(
input=input,
......@@ -124,14 +125,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin
### CNN implementation
```python
def create_cnn(self, emb, prefix=''):
'''
"""
A multi-layer CNN.
:param emb: The word embedding.
:type emb: paddle.layer
:param prefix: The prefix will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
'''
def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size)
conv = paddle.networks.sequence_conv_pool(
......@@ -139,15 +141,13 @@ def create_cnn(self, emb, prefix=''):
context_len=context_len,
hidden_size=hidden_size,
# set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
fc_param_attr=ParamAttr(name=key + '_fc.w'),
fc_bias_attr=ParamAttr(name=key + '_fc.b'),
pool_bias_attr=ParamAttr(name=key + '_pool.b'))
context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv
logger.info('create a sequence_conv_pool which context width is 3')
conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4
```
......@@ -160,9 +160,9 @@ RNN is suitable for learning variable length of the information
```python
def create_rnn(self, emb, prefix=''):
'''
"""
A GRU sentence vector learner.
'''
"""
gru = paddle.networks.simple_gru(
input=emb,
size=self.dnn_dims[1],
......@@ -178,14 +178,15 @@ def create_rnn(self, emb, prefix=''):
```python
def create_fc(self, emb, prefix=''):
'''
"""
A multi-layer fully connected neural networks.
:param emb: The output of the embedding layer
:type emb: paddle.layer
:param prefix: A prefix will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
'''
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc(
......@@ -202,13 +203,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling
```python
def create_dnn(self, sent_vec, prefix):
# if more than three layers exists, a fc layer will be added.
if len(self.dnn_dims) > 1:
_input_layer = sent_vec
for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim)
logger.info("create fc layer [%s] which dimention is %d" %
(name, dim))
fc = paddle.layer.fc(
input=_input_layer,
size=dim,
......@@ -222,117 +220,12 @@ def create_dnn(self, sent_vec, prefix):
### Classification / Regression
The structure of classification and regression is similar. Below function can be used for both tasks.
```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
Please check the function `_build_classification_or_regression_model` in [network_conf.py]( https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for detail implementation.
### Pairwise Rank
Please check the function `_build_rank_model` in [network_conf.py]( https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for implementation.
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not used.
return cost, None, None
```
## Data Format
Below is a simple example for the data in `./data`
......@@ -389,67 +282,7 @@ The example of this format is as follows.
## Training
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification.
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
Parameter description:
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The parameters accepted by the script `train.py` can be listed by executing `python train.py --help`. Some important parameters are:
- `train_data_path` Training data path
- `test_data_path` Test data path, optional
......@@ -460,48 +293,8 @@ Parameter description:
- `dnn_dims` The dimension of each layer of the model is set, the default is `256,128,64,32`,with 4 layers.
## To predict using the trained model
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
Important parameters are
The parameters accepted by the script `infer.py` can be listed by executing `python infer.py --help`. Some important parameters are:
- `data_path` Path for the data to predict
- `prediction_output_path` Prediction output path
......
......@@ -9,83 +9,81 @@ from utils import logger, ModelType, ModelArch, load_dic
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
parser.add_argument(
'--model_path',
type=str,
required=True,
help="path of model parameters file")
"--model_path", type=str, required=True, help="The path of trained model.")
parser.add_argument(
'-i',
'--data_path',
"-i",
"--data_path",
type=str,
required=True,
help="path of the dataset to infer")
help="The path of the data for inferring.")
parser.add_argument(
'-o',
'--prediction_output_path',
"-o",
"--prediction_output_path",
type=str,
required=True,
help="path to output the prediction")
help="The path to save the predictions.")
parser.add_argument(
'-y',
'--model_type',
"-y",
"--model_type",
type=int,
required=True,
default=ModelType.CLASSIFICATION_MODE,
help=("model type, %d for classification, %d for pairwise rank, "
"%d for regression (default: classification)") %
help=("The model type: %d for classification, %d for pairwise rank, "
"%d for regression (default: classification).") %
(ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
parser.add_argument(
'-s',
'--source_dic_path',
"-s",
"--source_dic_path",
type=str,
required=False,
help="path of the source's word dic")
help="The path of the source's word dictionary.")
parser.add_argument(
'--target_dic_path',
"--target_dic_path",
type=str,
required=False,
help=("path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used"))
help=("The path of the target's word dictionary, "
"if this parameter is not set, the `source_dic_path` will be used."))
parser.add_argument(
'-a',
'--model_arch',
"-a",
"--model_arch",
type=int,
required=True,
default=ModelArch.CNN_MODE,
help="model architecture, %d for CNN, %d for FC, %d for RNN" %
(ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument(
'--share_network_between_source_target',
"--share_network_between_source_target",
type=distutils.util.strtobool,
default=False,
help="whether to share network parameters between source and target")
parser.add_argument(
'--share_embed',
"--share_embed",
type=distutils.util.strtobool,
default=False,
help="whether to share word embedding between source and target")
parser.add_argument(
'--dnn_dims',
"--dnn_dims",
type=str,
default='256,128,64,32',
help=("dimentions of dnn layers, default is '256,128,64,32', "
"which means create a 4-layer dnn, "
"demention of each layer is 256, 128, 64 and 32"))
default="256,128,64,32",
help=("The dimentions of dnn layers, default is `256,128,64,32`, "
"which means a dnn with 4 layers with "
"dmentions 256, 128, 64 and 32 will be created."))
parser.add_argument(
'-c',
'--class_num',
"-c",
"--class_num",
type=int,
default=0,
help="number of categories for classification task.")
help="The number of categories for classification task.")
args = parser.parse_args()
args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task."
assert args.class_num > 1, ("The parameter class_num should be set "
"in classification task.")
layer_dims = map(int, args.dnn_dims.split(','))
layer_dims = map(int, args.dnn_dims.split(","))
args.target_dic_path = args.source_dic_path if not args.target_dic_path \
else args.target_dic_path
......@@ -94,8 +92,6 @@ paddle.init(use_gpu=False, trainer_count=1)
class Inferer(object):
def __init__(self, param_path):
logger.info("create DSSM model")
prediction = DSSM(
dnn_dims=layer_dims,
vocab_sizes=[
......@@ -110,14 +106,13 @@ class Inferer(object):
is_infer=True)()
# load parameter
logger.info("load model parameters from %s" % param_path)
logger.info("Load the trained model from %s." % param_path)
self.parameters = paddle.parameters.Parameters.from_tar(
open(param_path, 'r'))
open(param_path, "r"))
self.inferer = paddle.inference.Inference(
output_layer=prediction, parameters=self.parameters)
def infer(self, data_path):
logger.info("infer data...")
dataset = reader.Dataset(
train_path=data_path,
test_path=None,
......@@ -125,19 +120,20 @@ class Inferer(object):
target_dic_path=args.target_dic_path,
model_type=args.model_type, )
infer_reader = paddle.batch(dataset.infer, batch_size=1000)
logger.warning('write predictions to %s' % args.prediction_output_path)
logger.warning("Write predictions to %s." % args.prediction_output_path)
output_f = open(args.prediction_output_path, 'w')
output_f = open(args.prediction_output_path, "w")
for id, batch in enumerate(infer_reader()):
res = self.inferer.infer(input=batch)
predictions = [' '.join(map(str, x)) for x in res]
predictions = [" ".join(map(str, x)) for x in res]
assert len(batch) == len(predictions), (
"predict error, %d inputs, "
"but %d predictions") % (len(batch), len(predictions))
output_f.write('\n'.join(map(str, predictions)) + '\n')
"Error! %d inputs are given, "
"but only %d predictions are returned.") % (len(batch),
len(predictions))
output_f.write("\n".join(map(str, predictions)) + "\n")
if __name__ == '__main__':
if __name__ == "__main__":
inferer = Inferer(args.model_path)
inferer.infer(args.data_path)
......@@ -13,26 +13,33 @@ class DSSM(object):
class_num=None,
share_embed=False,
is_infer=False):
'''
@dnn_dims: list of int
dimentions of each layer in semantic vector generator.
@vocab_sizes: 2-d tuple
size of both left and right items.
@model_type: int
type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
@model_arch: int
model architecture
@share_semantic_generator: bool
whether to share the semantic vector generator for both left and right.
@share_embed: bool
whether to share the embeddings between left and right.
@class_num: int
number of categories.
'''
"""
:param dnn_dims: The dimension of each layer in the semantic vector
generator.
:type dnn_dims: list of int
:param vocab_sizes: The vocabulary sizes of the left and right items.
:type vocab_sizes: A list having 2 elements.
:param model_type: The type of task to train the DSSM model. The value
should be "rank: 0", "regression: 1" or
"classification: 2".
:type model_type: int
:param model_arch: A value indicating the model architecture to use.
:type model_arch: int
:param share_semantic_generator: A flag indicating whether to share the
semantic vector generator between the left and
the right item.
:type share_semantic_generator: bool
:param share_embed: A flag indicating whether to share the embeddings
between the left and the right item.
:type share_embed: bool
:param class_num: The number of categories.
:type class_num: int
"""
assert len(vocab_sizes) == 2, (
"vocab_sizes specify the sizes left and right inputs, "
"and dim should be 2.")
assert len(dnn_dims) > 1, "more than two layers is needed."
"The vocab_sizes specifying the sizes left and right inputs. "
"Its dimension should be 2.")
assert len(dnn_dims) > 1, ("In the DNN model, more than two layers "
"are needed.")
self.dnn_dims = dnn_dims
self.vocab_sizes = vocab_sizes
......@@ -42,91 +49,89 @@ class DSSM(object):
self.model_arch = ModelArch(model_arch)
self.class_num = class_num
self.is_infer = is_infer
logger.warning("build DSSM model with config of %s, %s" %
logger.warning("Build DSSM model with config of %s, %s" %
(self.model_type, self.model_arch))
logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
logger.info("The vocabulary size is : %s" % str(self.vocab_sizes))
# bind model architecture
_model_arch = {
'cnn': self.create_cnn,
'fc': self.create_fc,
'rnn': self.create_rnn,
"cnn": self.create_cnn,
"fc": self.create_fc,
"rnn": self.create_rnn,
}
def _model_arch_creater(emb, prefix=''):
def _model_arch_creater(emb, prefix=""):
sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
dnn = self.create_dnn(sent_vec, prefix)
return dnn
self.model_arch_creater = _model_arch_creater
# build model type
_model_type = {
'classification': self._build_classification_model,
'rank': self._build_rank_model,
'regression': self._build_regression_model,
"classification": self._build_classification_model,
"rank": self._build_rank_model,
"regression": self._build_regression_model,
}
print 'model type: ', str(self.model_type)
print("model type: ", str(self.model_type))
self.model_type_creater = _model_type[str(self.model_type)]
def __call__(self):
return self.model_type_creater()
def create_embedding(self, input, prefix=''):
'''
Create an embedding table whose name has a `prefix`.
'''
logger.info("create embedding table [%s] which dimention is %d" %
def create_embedding(self, input, prefix=""):
"""
Create word embedding. The `prefix` is added in front of the name of
the embedding's learnable parameter.
"""
logger.info("Create embedding table [%s] whose dimention is %d. " %
(prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding(
input=input,
size=self.dnn_dims[0],
param_attr=ParamAttr(name='%s_emb.w' % prefix))
param_attr=ParamAttr(name="%s_emb.w" % prefix))
return emb
def create_fc(self, emb, prefix=''):
'''
def create_fc(self, emb, prefix=""):
"""
A multi-layer fully connected neural networks.
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between
more than one `fc` parts.
'''
:param emb: The output of the embedding layer
:type emb: paddle.layer
:param prefix: A prefix will be added to the layers' names.
:type prefix: str
"""
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc(
input=_input_layer,
size=self.dnn_dims[1],
param_attr=ParamAttr(name='%s_fc.w' % prefix),
param_attr=ParamAttr(name="%s_fc.w" % prefix),
bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
return fc
def create_rnn(self, emb, prefix=''):
'''
def create_rnn(self, emb, prefix=""):
"""
A GRU sentence vector learner.
'''
"""
gru = paddle.networks.simple_gru(
input=emb,
size=self.dnn_dims[1],
mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix),
mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix),
mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
gru_param_attr=ParamAttr(name='%s_gru.w' % prefix),
gru_param_attr=ParamAttr(name="%s_gru.w" % prefix),
gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
sent_vec = paddle.layer.last_seq(gru)
return sent_vec
def create_cnn(self, emb, prefix=''):
'''
def create_cnn(self, emb, prefix=""):
"""
A multi-layer CNN.
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between
more than one `cnn` parts.
'''
:param emb: The word embedding.
:type emb: paddle.layer
:param prefix: The prefix will be added to the layers' names.
:type prefix: str
"""
def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size)
......@@ -135,15 +140,15 @@ class DSSM(object):
context_len=context_len,
hidden_size=hidden_size,
# set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
fc_param_attr=ParamAttr(name=key + '_fc.w'),
fc_bias_attr=ParamAttr(name=key + '_fc.b'),
pool_bias_attr=ParamAttr(name=key + '_pool.b'))
context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv
logger.info('create a sequence_conv_pool which context width is 3')
logger.info("create a sequence_conv_pool which context width is 3")
conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
logger.info("create a sequence_conv_pool which context width is 4")
conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4
......@@ -160,8 +165,8 @@ class DSSM(object):
input=_input_layer,
size=dim,
act=paddle.activation.Tanh(),
param_attr=ParamAttr(name='%s.w' % name),
bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.))
param_attr=ParamAttr(name="%s.w" % name),
bias_attr=ParamAttr(name="%s.b" % name, initial_std=0.))
_input_layer = fc
return _input_layer
......@@ -178,7 +183,7 @@ class DSSM(object):
is_classification=False)
def _build_rank_model(self):
'''
"""
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
......@@ -187,26 +192,26 @@ class DSSM(object):
- right_target sentence
- label, 1 if left_target should be sorted in front of
right_target, otherwise 0.
'''
"""
logger.info("build rank model")
assert self.model_type.is_rank()
source = paddle.layer.data(
name='source_input',
name="source_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
name="left_target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
name="right_target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
if not self.is_infer:
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
name="label_input", type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
prefixs = "_ _ _".split(
) if self.share_semantic_generator else "source target target".split()
embed_prefixs = "_ _".split(
) if self.share_embed else "source target target".split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
......@@ -218,9 +223,9 @@ class DSSM(object):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
# The cosine similarity score of source and left_target.
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
# The cosine similarity score of source and right target.
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
if not self.is_infer:
......@@ -233,34 +238,33 @@ class DSSM(object):
return right_score
def _build_classification_or_regression_model(self, is_classification):
'''
"""
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
The classification/regression task expects 3 inputs:
- source sentence
- target sentence
- classification label
'''
"""
if is_classification:
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
name="source_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
name="target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
name="label_input",
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_vector(1))
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
prefixs = "_ _".split(
) if self.share_semantic_generator else "source target".split()
embed_prefixs = "_ _".split(
) if self.share_embed else "source target".split()
word_vecs = []
for id, input in enumerate([source, target]):
......
......@@ -9,120 +9,129 @@ from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")
parser.add_argument(
'-i',
'--train_data_path',
"-i",
"--train_data_path",
type=str,
required=False,
help="path of training dataset")
help="The path of training data.")
parser.add_argument(
'-t',
'--test_data_path',
"-t",
"--test_data_path",
type=str,
required=False,
help="path of testing dataset")
help="The path of testing data.")
parser.add_argument(
'-s',
'--source_dic_path',
"-s",
"--source_dic_path",
type=str,
required=False,
help="path of the source's word dic")
help="The path of the source's word dictionary.")
parser.add_argument(
'--target_dic_path',
"--target_dic_path",
type=str,
required=False,
help=("path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used"))
help=("The path of the target's word dictionary, "
"if this parameter is not set, the `source_dic_path` will be used"))
parser.add_argument(
'-b',
'--batch_size',
"-b",
"--batch_size",
type=int,
default=32,
help="size of mini-batch (default:32)")
help="The size of mini-batch (default:32).")
parser.add_argument(
'-p',
'--num_passes',
"-p",
"--num_passes",
type=int,
default=10,
help="number of passes to run(default:10)")
help="The number of passes to run(default:10).")
parser.add_argument(
'-y',
'--model_type',
"-y",
"--model_type",
type=int,
required=True,
default=ModelType.CLASSIFICATION_MODE,
help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)"
% (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
help=("model type, %d for classification, %d for pairwise rank, "
"%d for regression (default: classification).") %
(ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
parser.add_argument(
'-a',
'--model_arch',
"-a",
"--model_arch",
type=int,
required=True,
default=ModelArch.CNN_MODE,
help="model architecture, %d for CNN, %d for FC, %d for RNN" %
help="The model architecture, %d for CNN, %d for FC, %d for RNN." %
(ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument(
'--share_network_between_source_target',
"--share_network_between_source_target",
type=distutils.util.strtobool,
default=False,
help="whether to share network parameters between source and target")
help="Whether to share network parameters between source and target.")
parser.add_argument(
'--share_embed',
"--share_embed",
type=distutils.util.strtobool,
default=False,
help="whether to share word embedding between source and target")
help="Whether to share word embedding between source and target.")
parser.add_argument(
'--dnn_dims',
"--dnn_dims",
type=str,
default='256,128,64,32',
help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32"
)
default="256,128,64,32",
help=("The dimentions of dnn layers, default is '256,128,64,32', "
"which means create a 4-layer dnn. The dimention of each layer is "
"'256, 128, 64 and 32'."))
parser.add_argument(
'--num_workers', type=int, default=1, help="num worker threads, default 1")
"--num_workers",
type=int,
default=1,
help="The number of worker threads, default 1.")
parser.add_argument(
'--use_gpu',
"--use_gpu",
type=distutils.util.strtobool,
default=False,
help="whether to use GPU devices (default: False)")
help="Whether to use GPU devices (default: False)")
parser.add_argument(
'-c',
'--class_num',
"-c",
"--class_num",
type=int,
default=0,
help="number of categories for classification task.")
help="The number of categories for classification task.")
parser.add_argument(
'--model_output_prefix',
"--model_output_prefix",
type=str,
default="./",
help="prefix of the path for model to store, (default: ./)")
help="The prefix of the path to store the trained models (default: ./).")
parser.add_argument(
'-g',
'--num_batches_to_log',
"-g",
"--num_batches_to_log",
type=int,
default=100,
help="number of batches to output train log, (default: 100)")
help=("The log period. Every num_batches_to_test batches, "
"a training log will be printed. (default: 100)"))
parser.add_argument(
'-e',
'--num_batches_to_test',
"-e",
"--num_batches_to_test",
type=int,
default=200,
help="number of batches to test, (default: 200)")
help=("The test period. Every num_batches_to_save_model batches, "
"the specified test sample will be test (default: 200)."))
parser.add_argument(
'-z',
'--num_batches_to_save_model',
"-z",
"--num_batches_to_save_model",
type=int,
default=400,
help="number of batches to output model, (default: 400)")
help=("Every num_batches_to_save_model batches, "
"a trained model will be saved (default: 400)."))
# arguments check.
args = parser.parse_args()
args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task."
assert args.class_num > 1, ("The parameter class_num should be set in "
"classification task.")
layer_dims = [int(i) for i in args.dnn_dims.split(',')]
args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path
layer_dims = [int(i) for i in args.dnn_dims.split(",")]
args.target_dic_path = args.source_dic_path if not \
args.target_dic_path else args.target_dic_path
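For reference, a minimal sketch of parsing the flags for a classification run is shown below; the file paths and the class count are placeholders, not values taken from this repository, and the mode constants are the ones used in the defaults above.

# A sketch only: hypothetical paths and class count for a classification run.
args = parser.parse_args([
    "-i", "./data/classification/train.txt",
    "-t", "./data/classification/test.txt",
    "-s", "./data/vocab.txt",
    "-y", str(ModelType.CLASSIFICATION_MODE),  # model type: classification
    "-a", str(ModelArch.CNN_MODE),             # model architecture: cnn
    "-c", "2",                                 # binary classification
])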
def train(train_data_path=None,
......@@ -138,15 +147,15 @@ def train(train_data_path=None,
class_num=None,
num_workers=1,
use_gpu=False):
'''
"""
Train the DSSM.
'''
default_train_path = './data/rank/train.txt'
default_test_path = './data/rank/test.txt'
default_dic_path = './data/vocab.txt'
"""
default_train_path = "./data/rank/train.txt"
default_test_path = "./data/rank/test.txt"
default_dic_path = "./data/vocab.txt"
if not model_type.is_rank():
default_train_path = './data/classification/train.txt'
default_test_path = './data/classification/test.txt'
default_train_path = "./data/classification/train.txt"
default_test_path = "./data/classification/test.txt"
use_default_data = not train_data_path
......@@ -200,19 +209,19 @@ def train(train_data_path=None,
feeding = {}
if model_type.is_classification() or model_type.is_regression():
feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
feeding = {"source_input": 0, "target_input": 1, "label_input": 2}
else:
feeding = {
'source_input': 0,
'left_target_input': 1,
'right_target_input': 2,
'label_input': 3
"source_input": 0,
"left_target_input": 1,
"right_target_input": 2,
"label_input": 3
}
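As a sketch of how this dict is consumed, each position in a reader sample is bound to the data layer whose name appears in `feeding`. The toy reader below and the already-constructed `trainer` object (a `paddle.trainer.SGD` instance built elsewhere in train.py) are assumptions for illustration.

def toy_reader():
    # (source word ids, target word ids, label) -- positions 0, 1, 2 match
    # the classification/regression feeding above.
    yield [2, 15, 7], [4, 9], 1

trainer.train(
    reader=paddle.batch(toy_reader, batch_size=args.batch_size),
    num_passes=args.num_passes,
    event_handler=_event_handler,  # defined just below
    feeding=feeding)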
def _event_handler(event):
'''
"""
Define batch handler
'''
"""
if isinstance(event, paddle.event.EndIteration):
# output train log
if event.batch_id % args.num_batches_to_log == 0:
......@@ -249,7 +258,7 @@ def train(train_data_path=None,
logger.info("Training has finished.")
if __name__ == '__main__':
if __name__ == "__main__":
display_args(args)
train(
train_data_path=args.train_data_path,
......
......@@ -8,7 +8,7 @@ logger.setLevel(logging.INFO)
def mode_attr_name(mode):
return mode.upper() + '_MODE'
return mode.upper() + "_MODE"
def create_attrs(cls):
......@@ -17,9 +17,9 @@ def create_attrs(cls):
def make_check_method(cls):
'''
"""
Create an is_<mode>() check method for each mode of the given class.
'''
"""
def method(mode):
def _method(self):
......@@ -28,7 +28,7 @@ def make_check_method(cls):
return _method
for id, mode in enumerate(cls.modes):
setattr(cls, 'is_' + mode, method(mode))
setattr(cls, "is_" + mode, method(mode))
def make_create_method(cls):
......@@ -41,10 +41,10 @@ def make_create_method(cls):
return _method
for id, mode in enumerate(cls.modes):
setattr(cls, 'create_' + mode, method(mode))
setattr(cls, "create_" + mode, method(mode))
def make_str_method(cls, type_name='unk'):
def make_str_method(cls, type_name="unk"):
def _str_(self):
for mode in cls.modes:
if self.mode == getattr(cls, mode_attr_name(mode)):
......@@ -53,9 +53,9 @@ def make_str_method(cls, type_name='unk'):
def _hash_(self):
return self.mode
setattr(cls, '__str__', _str_)
setattr(cls, '__repr__', _str_)
setattr(cls, '__hash__', _hash_)
setattr(cls, "__str__", _str_)
setattr(cls, "__repr__", _str_)
setattr(cls, "__hash__", _hash_)
cls.__name__ = type_name
......@@ -65,7 +65,7 @@ def _init_(self, mode, cls):
elif isinstance(mode, cls):
self.mode = mode.mode
else:
raise Exception("wrong mode type, get type: %s, value: %s" %
raise Exception("A wrong mode type, get type: %s, value: %s." %
(type(mode), mode))
......@@ -77,21 +77,21 @@ def build_mode_class(cls):
class TaskType(object):
modes = 'train test infer'.split()
modes = "train test infer".split()
def __init__(self, mode):
_init_(self, mode, TaskType)
class ModelType:
modes = 'classification rank regression'.split()
modes = "classification rank regression".split()
def __init__(self, mode):
_init_(self, mode, ModelType)
class ModelArch:
modes = 'fc cnn rnn'.split()
modes = "fc cnn rnn".split()
def __init__(self, mode):
_init_(self, mode, ModelArch)
......@@ -103,22 +103,16 @@ build_mode_class(ModelArch)
def sent2ids(sent, vocab):
'''
"""
Transform a sentence into a list of word ids.
@sent: str
a sentence.
@vocab: dict
a word dictionary.
'''
"""
return [vocab.get(w, UNK) for w in sent.split()]
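A quick illustration of sent2ids with a toy vocabulary; the vocabulary contents here are an assumption for the example, and out-of-vocabulary words fall back to the module's UNK id.

toy_vocab = {"hello": 0, "world": 1}
print sent2ids("hello world foo", toy_vocab)
# -> [0, 1, UNK]; "foo" is not in the vocabulary, so it maps to UNK.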
def load_dic(path):
'''
word dic format:
each line is a word
'''
"""
The format of the word dictionary: each line is a word.
"""
dic = {}
with open(path) as f:
for id, line in enumerate(f):
......@@ -128,13 +122,6 @@ def load_dic(path):
def display_args(args):
logger.info("arguments passed by command line:")
logger.info("The arguments passed by command line is :")
for k, v in sorted(vars(args).items()):
logger.info("{}:\t{}".format(k, v))
if __name__ == '__main__':
t = TaskType(1)
t = TaskType.create_train()
print t
print 'is', t.is_train()
......@@ -33,7 +33,6 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
num_classes=dict_size,
param_attr=paddle.attr.Param(name="nce_w"),
bias_attr=paddle.attr.Param(name="nce_b"),
act=paddle.activation.Sigmoid(),
num_neg_samples=25,
neg_distribution=None)
else:
......@@ -41,7 +40,7 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
size=dict_size,
input=paddle.layer.trans_full_matrix_projection(
hidden_layer, param_attr=paddle.attr.Param(name="nce_w")),
act=paddle.activation.Sigmoid(),
act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name="nce_b"))
......
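The last hunk removes the explicit Sigmoid activation from the NCE layer used during training and switches the inference branch to a Softmax over the same `nce_w`/`nce_b` parameters. A condensed sketch of that weight-sharing pattern follows; the hidden layer, label layer and dictionary size are assumed inputs, since the start of the call is elided in the hunk above.

import paddle.v2 as paddle

def word_prediction(hidden_layer, label, dict_size, is_train):
    if is_train:
        # Noise-contrastive estimation approximates the softmax during training.
        return paddle.layer.nce(
            input=hidden_layer,
            label=label,
            num_classes=dict_size,
            param_attr=paddle.attr.Param(name="nce_w"),
            bias_attr=paddle.attr.Param(name="nce_b"),
            num_neg_samples=25,
            neg_distribution=None)
    # At inference time, a full softmax reuses the NCE parameters.
    return paddle.layer.mixed(
        size=dict_size,
        input=paddle.layer.trans_full_matrix_projection(
            hidden_layer, param_attr=paddle.attr.Param(name="nce_w")),
        act=paddle.activation.Softmax(),
        bias_attr=paddle.attr.Param(name="nce_b"))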