Commit dbff6d68 authored by: W wangmeng28

Merge remote-tracking branch 'upstream/develop' into chinese_poetry

...@@ -11,6 +11,7 @@ import multiprocessing
import numpy as np
import paddle.v2 as paddle
from threading import local
import atexit
from data_utils.utility import read_manifest
from data_utils.utility import xmap_readers_mp
from data_utils.augmentor.augmentation import AugmentationPipeline
...@@ -59,6 +60,9 @@ class DataGenerator(object):
be passed forward directly without
converting to index sequence.
:type keep_transcription_text: bool
:param num_conv_layers: The number of convolution layers, used to compute
the sequence length.
:type num_conv_layers: int
"""
def __init__(self,
...@@ -74,7 +78,8 @@ class DataGenerator(object):
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count() // 2,
random_seed=0,
keep_transcription_text=False,
num_conv_layers=2):
self._max_duration = max_duration
self._min_duration = min_duration
self._normalizer = FeatureNormalizer(mean_std_filepath)
...@@ -95,6 +100,7 @@ class DataGenerator(object):
self._local_data = local()
self._local_data.tar2info = {}
self._local_data.tar2object = {}
self._num_conv_layers = num_conv_layers
def process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data.
...@@ -213,7 +219,15 @@ class DataGenerator(object):
:return: Data feeding dict.
:rtype: dict
"""
feeding_dict = {
"audio_spectrogram": 0,
"transcript_text": 1,
"sequence_offset": 2,
"sequence_length": 3
}
for i in xrange(self._num_conv_layers):
feeding_dict["conv%d_index_range" % i] = len(feeding_dict)
return feeding_dict
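# Note: for the default num_conv_layers=2, the dict built above would be
# {"audio_spectrogram": 0, "transcript_text": 1, "sequence_offset": 2,
#  "sequence_length": 3, "conv0_index_range": 4, "conv1_index_range": 5};
# each additional convolution layer appends one more "conv%d_index_range" field.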
@property
def vocab_size(self):
...@@ -274,13 +288,18 @@ class DataGenerator(object):
for instance in manifest:
yield instance
reader, cleanup_callback = xmap_readers_mp(
lambda instance: self.process_utterance(instance["audio_filepath"], instance["text"]),
reader,
self._num_threads,
4096,
order=True)
# register the cleanup callback with atexit in the main process
atexit.register(cleanup_callback)
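# cleanup_callback (the second value returned by xmap_readers_mp) calls os._exit(0);
# registering it with atexit ensures the reader's worker processes are torn down
# when the main process exits instead of being left behind.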
return reader
def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
Padding audio features with zeros to make them have the same shape (or
...@@ -306,7 +325,30 @@ class DataGenerator(object):
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
# Stride size for conv0 is (3, 2)
# Stride size for conv1 to convN is (1, 2)
# Same as the network, hard-coded here
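# Editorial note: for a convolution with stride s along one axis, the output
# length is out = (in - 1) // s + 1, which is what the computations below apply
# per axis. For example, assuming a padded spectrogram of shape (161, 300):
#   padded_conv0_h = (161 - 1) // 2 + 1 = 81
#   padded_conv0_w = (300 - 1) // 3 + 1 = 100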
padded_instance = [padded_audio, text]
padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
valid_w = (audio.shape[1] - 1) // 3 + 1
padded_instance += [
[0], # sequence offset, always 0
[valid_w], # valid sequence length
# Index ranges for channel, height and width
# Please refer to the scale_sub_region layer for details
[1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
]
pre_padded_h = padded_conv0_h
for i in xrange(self._num_conv_layers - 1):
padded_h = (pre_padded_h - 1) // 2 + 1
pre_padded_h = padded_h
padded_instance += [
[1, 32, 1, padded_h, valid_w + 1, padded_conv0_w]
]
new_batch.append(padded_instance)
return new_batch
def _batch_shuffle(self, manifest, batch_size, clipped=False):
......
...@@ -138,6 +138,10 @@ def xmap_readers_mp(mapper, reader, process_num, buffer_size, order=False):
out_queue.put(sample)
out_queue.put(end_flag)
def cleanup():
# kill all subprocesses and threads
os._exit(0)
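# os._exit terminates the interpreter immediately, without running Python
# cleanup handlers, so hung worker processes or threads cannot block shutdown.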
def xreader():
# prepare shared memory
manager = Manager()
...@@ -174,4 +178,4 @@ def xmap_readers_mp(mapper, reader, process_num, buffer_size, order=False):
yield sample
sample = flush_queue.get()
return xreader, cleanup
...@@ -70,7 +70,6 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# FILES + glob.glob('glog/src/*.cc')
FILES = [
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
...@@ -107,7 +106,6 @@ decoders_module = [
'kenlm',
'openfst-1.6.3/src/include',
'ThreadPool',
#'glog/src'
],
libraries=LIBS,
extra_compile_args=ARGS)
...@@ -115,7 +113,7 @@ decoders_module = [
setup(
name='swig_decoders',
version='1.0',
description="""CTC decoders""",
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
...@@ -69,7 +69,8 @@ def infer():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=1,
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.infer_manifest,
batch_size=args.num_samples,
...@@ -100,10 +101,11 @@ def infer():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
error_rate_func = cer if args.error_rate_type == 'cer' else wer
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
print("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
......
...@@ -165,7 +165,7 @@ class DeepSpeech2Model(object):
def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n, vocab_list,
language_model_path, num_processes, feeding_dict):
"""Model inference. Infer the transcription for a batch of speech
utterances.
...@@ -195,6 +195,9 @@ class DeepSpeech2Model(object):
:type language_model_path: basestring|None
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:param feeding_dict: A map from data field names to the tuple indices
of the data returned by the reader.
:type feeding_dict: dict|list
:return: List of transcription texts.
:rtype: List of basestring
"""
...@@ -203,10 +206,13 @@ class DeepSpeech2Model(object):
self._inferer = paddle.inference.Inference(
output_layer=self._log_probs, parameters=self._parameters)
# run inference
infer_results = self._inferer.infer(
input=infer_data, feeding=feeding_dict)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
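# infer_data[i][3] holds the valid sequence length of utterance i (index 3,
# "sequence_length", in the feeding dict), so start_pos accumulates the offsets
# used below to slice the flattened infer_results into per-utterance chunks.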
probs_split = [
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
# run decoder
...@@ -274,9 +280,25 @@ class DeepSpeech2Model(object):
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(num_rnn_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))
self._log_probs, self._loss = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=vocab_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
......
...@@ -7,7 +7,7 @@ import paddle.v2 as paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act, index_range_data):
"""Convolution layer with batch normalization.
:param input: Input layer.
...@@ -24,6 +24,8 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
:type padding: int|tuple|list
:param act: Activation type.
:type act: BaseActivation
:param index_range_data: Index range indicating the padded sub-region to reset.
:type index_range_data: LayerOutput
:return: Batch norm layer after convolution layer.
:rtype: LayerOutput
"""
...@@ -36,7 +38,11 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
batch_norm = paddle.layer.batch_norm(input=conv_layer, act=act)
# reset padding part to 0
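# index_range_data is a 6-value dense vector produced by the data generator:
# [channel_start, channel_end, height_start, height_end, width_start, width_end].
# As built in DataGenerator._padding_batch it covers the zero-padded time steps
# (width beyond the valid sequence length) of all 32 channels, so scaling that
# sub-region by value=0.0 clears activations that come from padding.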
scale_sub_region = paddle.layer.scale_sub_region(
batch_norm, index_range_data, value=0.0)
return scale_sub_region
def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
...@@ -136,13 +142,15 @@ def bidirectional_gru_bn_layer(name, input, size, act):
return paddle.layer.concat(input=[forward_gru, backward_gru])
def conv_group(input, num_stacks, index_range_datas):
"""Convolution group with stacked convolution layers.
:param input: Input layer.
:type input: LayerOutput
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
:param index_range_datas: Index ranges for each convolution layer.
:type index_range_datas: tuple|list
:return: Output layer of the convolution group.
:rtype: LayerOutput
"""
...@@ -153,7 +161,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[0])
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
...@@ -162,7 +171,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[i + 1])
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
...@@ -207,6 +217,9 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
def deep_speech_v2_network(audio_data,
text_data,
seq_offset_data,
seq_len_data,
index_range_datas,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
...@@ -219,6 +232,12 @@ def deep_speech_v2_network(audio_data,
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param seq_offset_data: Sequence offset data layer.
:type seq_offset_data: LayerOutput
:param seq_len_data: Valid sequence length data layer.
:type seq_len_data: LayerOutput
:param index_range_datas: Index ranges data layers.
:type index_range_datas: tuple|list
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
...@@ -239,7 +258,9 @@ def deep_speech_v2_network(audio_data,
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data,
num_stacks=num_conv_layers,
index_range_datas=index_range_datas)
# convert data from convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
...@@ -248,9 +269,16 @@ def deep_speech_v2_network(audio_data,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# remove padding part
remove_padding_data = paddle.layer.sub_seq(
input=conv2seq,
offsets=seq_offset_data,
sizes=seq_len_data,
act=paddle.activation.Linear(),
bias_attr=False)
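# For every utterance, sub_seq keeps sizes[i] time steps starting at offsets[i]
# (the offset is always 0 here), so the zero-padded tail of the convolution
# output never reaches the recurrent layers below.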
# rnn group
rnn_group_output = rnn_group(
input=remove_padding_data,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru,
......
...@@ -70,7 +70,8 @@ def evaluate():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.test_manifest,
batch_size=args.batch_size,
...@@ -103,8 +104,9 @@ def evaluate():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
error_sum += error_rate_func(target, result)
num_ins += 1
......
...@@ -88,7 +88,8 @@ def tune():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
audio_data = paddle.layer.data(
name="audio_spectrogram",
...@@ -96,10 +97,25 @@ def tune():
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(args.num_rnn_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))
output_probs, _ = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
...@@ -156,15 +172,17 @@ def tune():
for infer_data in batch_reader():
if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
break
infer_results = inferer.infer(input=infer_data,
feeding=data_generator.feeding)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
probs_split = [
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
target_transcripts = [data[1] for data in infer_data]
num_ins += len(target_transcripts)
# grid search
......
...@@ -75,13 +75,15 @@ def train():
max_duration=args.max_duration,
min_duration=args.min_duration,
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
dev_generator = DataGenerator(
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config="{}",
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest,
batch_size=args.batch_size,
......
# Deep Structured Semantic Models (DSSM)

DSSM uses a DNN to learn low-dimensional representation vectors of text in a continuous semantic space and to model the semantic similarity between two sentences. This example shows how to use PaddlePaddle to implement a generic DSSM model for modeling the semantic similarity between two strings. The implementation supports a generic data format, so the model can be applied to real scenarios simply by swapping in your own data.
## Background

DSSM \[[1](#参考文献)\] is a classic semantic model proposed by Microsoft Research in 2013 for learning the semantic distance between two texts. More broadly, the model also applies to scenarios such as:

1. CTR prediction, measuring the relevance between a user search query (Query) and a set of candidate web pages (Documents).
2. Text relevance, measuring the semantic relatedness between two strings.
3. Recommendation, measuring the relatedness between a User and a recommended Item.

DSSM has evolved into a framework that naturally models the distance between two records: for text relevance, cosine similarity can be used to express the semantic distance, while for ranking search-engine results, a Rank loss can be attached on top of DSSM to train a ranking model.
## Model overview

In the original paper \[[1](#参考文献)\], DSSM is used to measure the latent semantic relationship between a user search Query and a set of Documents. The model structure is shown below:
...@@ -23,12 +18,9 @@ DSSM has evolved into a framework that naturally models the distance between two records
Figure 1. Original DSSM structure
</p>
The core idea is to **use a DNN to map high-dimensional feature vectors into continuous vectors in a low-dimensional space (the red boxes in the figure)** and, **at the top layer, to use cosine similarity to measure the semantic relevance between the search query and each candidate document**.

For the loss at the very top, the original model uses negative sampling similar to Word2Vec: for each Query one positive example $D+$ and four negative examples $D-$ are drawn, a conditional probability over them is computed, and the log-likelihood is used as the loss. This is the $P(D_1|Q)$-like structure in Figure 1; see the original paper for details.
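Written out (a sketch using the notation of the original paper, where $R(Q, D)$ is the cosine similarity between the two semantic vectors and $\gamma$ is a smoothing factor), the conditional probability and the loss take roughly the following form:

$$P(D^+ \mid Q) = \frac{\exp\big(\gamma R(Q, D^+)\big)}{\sum_{D' \in \{D^+, D^-_1, \dots, D^-_4\}} \exp\big(\gamma R(Q, D')\big)}, \qquad L = -\log \prod_{(Q, D^+)} P(D^+ \mid Q)$$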
With subsequent refinements the DSSM structure was simplified \[[3](#参考文献)\] and evolved into:
...@@ -37,37 +29,30 @@ DSSM has evolved into a framework that naturally models the distance between two records
Figure 2. Generic DSSM structure
</p>
The blank boxes in the figure can be replaced with any model, e.g. fully connected layers (FC), convolutional networks (CNN), or RNNs. The structure is dedicated to measuring the semantic distance between two elements (such as strings). In practice, DSSM serves as a basic building block that is combined with different loss functions to implement a concrete task, for example:

- In learning to rank, adding a pairwise rank loss to the structure in Figure 2 turns it into a ranking model
- In CTR prediction, treating click / no-click as 0/1 binary classification and adding a cross-entropy loss turns it into a classification model
- When a string pair needs a single score, cosine similarity can be used, turning it into a regression model

This example provides a fairly general, application-oriented solution. The supported task types are:

- Classification
- Regression within the range [-1, 1]
- Pairwise-Rank

For generating the low-dimensional semantic vectors, the following three structures are supported:

- FC, multi-layer fully connected layers
- CNN, convolutional neural network
- RNN, recurrent neural network
## Model implementation

The DSSM model can be split into three parts: the left and right DNNs, and the loss function on top. In complex tasks the structures of the two DNNs may differ; in the original paper the left and right networks learn semantic vectors for the Query and the Document respectively, and since their input data differ, it is advisable to customize each DNN accordingly.

**For simplicity and generality, this example uses the same structure for both DNNs, so there are only three options: FC, CNN and RNN.**
Three kinds of loss are supported as well: classification, regression and ranking. For the regression and ranking losses, the match between the two sides is computed with cosine similarity; for classification, the distribution over classes is computed with softmax.
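A minimal sketch of the two kinds of output head (assuming `left_vec` and `right_vec` are the semantic vectors produced by the two DNNs, `label` is the corresponding label data layer, and `class_num` is the number of classes):

```python
# Regression / ranking head: cosine similarity of the two semantic vectors.
sim = paddle.layer.cos_sim(left_vec, right_vec)

# Classification head: concatenate both vectors and predict a class distribution.
concated_vector = paddle.layer.concat([left_vec, right_vec])
prediction = paddle.layer.fc(
    input=concated_vector,
    size=class_num,
    act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
```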
Many of the concepts above are covered in detail in other tutorials, for example:
...@@ -77,19 +62,17 @@ The DSSM model can be split into three parts: the left and right DNNs, and the
so the underlying principles are not repeated here; the rest of this document focuses on how to implement these structures with PaddlePaddle.

As shown in Figure 3, the regression and classification models share a similar structure:
<p align="center">
<img src="./images/dssm3.jpg"/><br/><br/>
Figure 3. DSSM for REGRESSION or CLASSIFICATION
</p>

The most important components are the word embeddings, the two learners `(1)` and `(2)` in the figure that produce the low-dimensional vectors (each can be implemented with any of RNN/CNN/FC), and the loss function at the top.
The Pairwise Rank structure is a bit more complex: the structure of Figure 4 appears twice, with a corresponding loss function added. The overall idea is:

- Given the same source, score the left and right targets separately — `(a)` and `(b)`; the learning objective is the ordering between (a) and (b)
- `(a)` and `(b)` have the same structure as in Figure 3 and score a (source, target) pair
- `(1)` and `(2)` actually share the same structure and both represent the same source; the figure unrolls them into two only for presentation
...@@ -98,17 +81,18 @@ The DSSM model can be split into three parts: the left and right DNNs, and the
Figure 4. DSSM for Pairwise Rank
</p>
The concrete implementation of each part is described below; all the code is contained in `./network_conf.py`.

### Creating the word embedding table for the text
```python
def create_embedding(self, input, prefix=''):
    """
    Create word embedding. The `prefix` is added in front of the name of
    the embedding's learnable parameter.
    """
    logger.info("Create embedding table [%s] whose dimension is %d" %
                (prefix, self.dnn_dims[0]))
    emb = paddle.layer.embedding(
        input=input,
...@@ -123,14 +107,15 @@ def create_embedding(self, input, prefix=''):
```python
def create_cnn(self, emb, prefix=''):
    """
    A multi-layer CNN.

    :param emb: The word embedding.
    :type emb: paddle.layer
    :param prefix: The prefix added to the layers' names.
    :type prefix: str
    """

    def create_conv(context_len, hidden_size, prefix):
        key = "%s_%d_%d" % (prefix, context_len, hidden_size)
        conv = paddle.networks.sequence_conv_pool(
...@@ -138,21 +123,18 @@ def create_cnn(self, emb, prefix=''):
            context_len=context_len,
            hidden_size=hidden_size,
            # set parameter attr for parameter sharing
            context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
            fc_param_attr=ParamAttr(name=key + "_fc.w"),
            fc_bias_attr=ParamAttr(name=key + "_fc.b"),
            pool_bias_attr=ParamAttr(name=key + "_pool.b"))
        return conv

    conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
    conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
    return conv_3, conv_4
```
The CNN takes the sequence of word embeddings and, through convolution and pooling, captures the key information of the original sentence, finally producing a single semantic vector (which can be regarded as a sentence vector).

In this implementation, the sentence vectors learned by CNNs with context windows of 3 and 4 are summed element-wise to obtain the final sentence vector.
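A minimal sketch of that combination (assuming the element-wise sum is done with `paddle.layer.addto`; the actual code lives in `network_conf.py`):

```python
conv_3, conv_4 = self.create_cnn(emb, prefix='cnn')
# Element-wise sum of the sentence vectors learned with context windows 3 and 4.
sent_vec = paddle.layer.addto(input=[conv_3, conv_4])
```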
...@@ -162,9 +144,9 @@ RNNs are well suited to learning from variable-length sequences; using an RNN to learn sentence information is almost
```python
def create_rnn(self, emb, prefix=''):
    """
    A GRU sentence vector learner.
    """
    gru = paddle.networks.simple_gru(
        input=emb,
        size=self.dnn_dims[1],
...@@ -176,18 +158,19 @@ def create_rnn(self, emb, prefix=''):
    return sent_vec
```
### Multi-layer fully connected network (FC)
```python
def create_fc(self, emb, prefix=''):
    """
    A multi-layer fully connected neural network.

    :param emb: The output of the embedding layer
    :type emb: paddle.layer
    :param prefix: A prefix will be added to the layers' names.
    :type prefix: str
    """
    _input_layer = paddle.layer.pooling(
        input=emb, pooling_type=paddle.pooling.Max())
    fc = paddle.layer.fc(
...@@ -198,21 +181,17 @@ def create_fc(self, emb, prefix=''):
    return fc
```
When building the fully connected network, `paddle.layer.pooling` is first used to apply max pooling over the sequence of word vectors, turning the variable-length sequence into a fixed-dimension vector that represents the whole sentence; max pooling reduces the influence of sentence length on the sentence representation.

### Multi-layer DNN
After the CNN/RNN/FC has produced a semantic vector, additional fully connected layers can be stacked on top to form a deeper DNN.
```python
def create_dnn(self, sent_vec, prefix):
    if len(self.dnn_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, id, dim)
            fc = paddle.layer.fc(
                input=_input_layer,
                size=dim,
...@@ -224,119 +203,13 @@ def create_dnn(self, sent_vec, prefix):
    return _input_layer
```
### Classification and regression
The structures of the classification and regression models are similar, so a single function builds both. The full implementation is the `_build_classification_or_regression_model` function in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py), shown below:

```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
### Pairwise Rank
Pairwise Rank reuses the DNN structures above: the same source is scored against two targets, and if the left target scores higher the prediction is 1, otherwise 0. The implementation is the `_build_rank_model` function in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py), shown below:
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not be used.
return cost, None, None
```
## Data format
Simple example data is provided in `./data`.
...@@ -371,7 +244,6 @@ def _build_rank_model(self):
6 10 \t 8 3 1 \t 1
```
### Data format for ranking
```
# 4 fields each line:
...@@ -391,68 +263,11 @@ def _build_rank_model(self):
## Training
You can run `python train.py -y 0 --model_arch 0` directly; it uses the sample data in `./data/classification` to check that training a classification FC model runs end to end.

Other model structures can also be configured from the command line; run `python train.py --help` for the full set of options, listed below:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
The most important parameters are:

- `train_data_path` Path of the training data
- `test_data_path` Path of the test data (optional)
...@@ -462,49 +277,8 @@ optional arguments:
- `model_arch` Model architecture: 0 for FC, 1 for CNN, 2 for RNN
- `dnn_dims` Dimensions of the DNN layers, `256,128,64,32` by default, i.e. a 4-layer DNN whose layers have those sizes
## Predicting with a trained model
The detailed command-line options can be listed with `python infer.py --help`; the usage is shown below and the most important parameters are explained after it.
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
Most parameters mirror those of `train.py`; the important ones are:
- `data_path` Path of the data to predict on
- `prediction_output_path` Output path for the predictions
......
...@@ -65,10 +65,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are
### Create a word vector table for the text
```python
def create_embedding(self, input, prefix=''):
    """
    Create word embedding. The `prefix` is added in front of the name of
    the embedding's learnable parameter.
    """
    logger.info("Create embedding table [%s] whose dimension is %d" %
                (prefix, self.dnn_dims[0]))
    emb = paddle.layer.embedding(
        input=input,
...@@ -82,14 +83,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin
### CNN implementation
```python
def create_cnn(self, emb, prefix=''):
    """
    A multi-layer CNN.

    :param emb: The word embedding.
    :type emb: paddle.layer
    :param prefix: The prefix added to the layers' names.
    :type prefix: str
    """

    def create_conv(context_len, hidden_size, prefix):
        key = "%s_%d_%d" % (prefix, context_len, hidden_size)
        conv = paddle.networks.sequence_conv_pool(
...@@ -97,15 +99,13 @@ def create_cnn(self, emb, prefix=''):
            context_len=context_len,
            hidden_size=hidden_size,
            # set parameter attr for parameter sharing
            context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
            fc_param_attr=ParamAttr(name=key + "_fc.w"),
            fc_bias_attr=ParamAttr(name=key + "_fc.b"),
            pool_bias_attr=ParamAttr(name=key + "_pool.b"))
        return conv

    conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
    conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
    return conv_3, conv_4
```
...@@ -118,9 +118,9 @@ RNN is suitable for learning variable length of the information
```python
def create_rnn(self, emb, prefix=''):
    """
    A GRU sentence vector learner.
    """
    gru = paddle.networks.simple_gru(
        input=emb,
        size=self.dnn_dims[1],
...@@ -136,14 +136,15 @@ def create_rnn(self, emb, prefix=''):
```python
def create_fc(self, emb, prefix=''):
    """
    A multi-layer fully connected neural network.

    :param emb: The output of the embedding layer
    :type emb: paddle.layer
    :param prefix: A prefix will be added to the layers' names.
    :type prefix: str
    """
    _input_layer = paddle.layer.pooling(
        input=emb, pooling_type=paddle.pooling.Max())
    fc = paddle.layer.fc(
...@@ -160,13 +161,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling
```python
def create_dnn(self, sent_vec, prefix):
    if len(self.dnn_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, id, dim)
            fc = paddle.layer.fc(
                input=_input_layer,
                size=dim,
...@@ -180,117 +178,12 @@ def create_dnn(self, sent_vec, prefix):
### Classification / Regression
The structures of the classification and regression models are similar, so the function below can be used for both tasks; see `_build_classification_or_regression_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the full implementation.
```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
### Pairwise Rank
The Pairwise Rank model is implemented by the function `_build_rank_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py), shown below.
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not be used.
return cost, None, None
```
## Data Format
Below is a simple example for the data in `./data`
...@@ -347,67 +240,7 @@ The example of this format is as follows.
## Training
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The full set of parameters accepted by `train.py` can be listed by running `python train.py --help`:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
Some important parameters are:

- `train_data_path` Training data path
- `test_data_path` Test data path, optional
...@@ -418,48 +251,8 @@ Parameter description:
- `dnn_dims` The dimensions of the model's layers, `256,128,64,32` by default, i.e. 4 layers with those sizes.
## To predict using the trained model
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
The usage above can also be printed by running `python infer.py --help`. The important parameters are:
- `data_path` Path for the data to predict
- `prediction_output_path` Prediction output path
......
...@@ -107,10 +107,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are
### Create a word vector table for the text
```python ```python
def create_embedding(self, input, prefix=''): def create_embedding(self, input, prefix=''):
''' """
Create an embedding table whose name has a `prefix`. Create word embedding. The `prefix` is added in front of the name of
    ''' embedding's learnable parameter.
logger.info("create embedding table [%s] which dimention is %d" % """
    logger.info("Create embedding table [%s] whose dimension is %d" %
(prefix, self.dnn_dims[0])) (prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding( emb = paddle.layer.embedding(
input=input, input=input,
...@@ -124,14 +125,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin ...@@ -124,14 +125,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin
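To make this concrete, the following is a minimal sketch of turning a raw sentence into the id sequence the embedding layer consumes; the `vocab` dict and `UNK` id are illustrative stand-ins for the real word dictionary (see `sent2ids` in `utils.py`).

```python
# Convert words to integer ids before the embedding lookup.
UNK = 2                                  # illustrative id for out-of-vocabulary words
vocab = {"hello": 0, "world": 1}         # illustrative word dictionary
sent = "hello brave world"
ids = [vocab.get(w, UNK) for w in sent.split()]
print(ids)                               # [0, 2, 1]
```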
### CNN implementation ### CNN implementation
```python ```python
def create_cnn(self, emb, prefix=''): def create_cnn(self, emb, prefix=''):
'''
"""
A multi-layer CNN. A multi-layer CNN.
:param emb: The word embedding.
:type emb: paddle.layer
    :param prefix: The prefix that will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
'''
def create_conv(context_len, hidden_size, prefix): def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size) key = "%s_%d_%d" % (prefix, context_len, hidden_size)
conv = paddle.networks.sequence_conv_pool( conv = paddle.networks.sequence_conv_pool(
...@@ -139,15 +141,13 @@ def create_cnn(self, emb, prefix=''): ...@@ -139,15 +141,13 @@ def create_cnn(self, emb, prefix=''):
context_len=context_len, context_len=context_len,
hidden_size=hidden_size, hidden_size=hidden_size,
# set parameter attr for parameter sharing # set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'), context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + '_fc.w'), fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + '_fc.b'), fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + '_pool.b')) pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv return conv
logger.info('create a sequence_conv_pool which context width is 3')
conv_3 = create_conv(3, self.dnn_dims[1], "cnn") conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
conv_4 = create_conv(4, self.dnn_dims[1], "cnn") conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4 return conv_3, conv_4
``` ```
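The two `sequence_conv_pool` outputs act as 3-gram and 4-gram feature extractors over the sentence. The fragment below is a sketch of how they are consumed inside `network_conf.py` (mirroring the internal `_model_arch_creater` helper, so `self`, `emb` and `prefix` refer to the DSSM instance, the embedding layer and the name prefix):

```python
# Chain the CNN branch with the DNN stacked on top of it.
sent_vec = self.create_cnn(emb, prefix)   # returns the pair (conv_3, conv_4)
dnn = self.create_dnn(sent_vec, prefix)   # both pooled outputs feed the first fc layer
```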
...@@ -160,9 +160,9 @@ RNN is suitable for learning variable length of the information ...@@ -160,9 +160,9 @@ RNN is suitable for learning variable length of the information
```python ```python
def create_rnn(self, emb, prefix=''): def create_rnn(self, emb, prefix=''):
''' """
A GRU sentence vector learner. A GRU sentence vector learner.
''' """
gru = paddle.networks.simple_gru( gru = paddle.networks.simple_gru(
input=emb, input=emb,
size=self.dnn_dims[1], size=self.dnn_dims[1],
...@@ -178,14 +178,15 @@ def create_rnn(self, emb, prefix=''): ...@@ -178,14 +178,15 @@ def create_rnn(self, emb, prefix=''):
```python ```python
def create_fc(self, emb, prefix=''): def create_fc(self, emb, prefix=''):
'''
"""
    A multi-layer fully connected neural networks. A multi-layer fully connected neural network.
:param emb: The output of the embedding layer
:type emb: paddle.layer
:param prefix: A prefix will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
'''
_input_layer = paddle.layer.pooling( _input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max()) input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc( fc = paddle.layer.fc(
...@@ -202,13 +203,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling ...@@ -202,13 +203,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling
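As a quick, framework-independent illustration of what that max pooling computes: every dimension of the sentence vector is the maximum of that dimension across all word vectors.

```python
# Max pooling over a sequence of word vectors, with toy numbers.
word_vectors = [[0.1, 0.5], [0.7, 0.2], [0.3, 0.9]]           # 3 words, dimension 2
sentence_vector = [max(dims) for dims in zip(*word_vectors)]  # -> [0.7, 0.9]
```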
```python ```python
def create_dnn(self, sent_vec, prefix): def create_dnn(self, sent_vec, prefix):
# if more than three layers exists, a fc layer will be added.
if len(self.dnn_dims) > 1: if len(self.dnn_dims) > 1:
_input_layer = sent_vec _input_layer = sent_vec
for id, dim in enumerate(self.dnn_dims[1:]): for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim) name = "%s_fc_%d_%d" % (prefix, id, dim)
logger.info("create fc layer [%s] which dimention is %d" %
(name, dim))
fc = paddle.layer.fc( fc = paddle.layer.fc(
input=_input_layer, input=_input_layer,
size=dim, size=dim,
...@@ -222,117 +220,12 @@ def create_dnn(self, sent_vec, prefix): ...@@ -222,117 +220,12 @@ def create_dnn(self, sent_vec, prefix):
### Classification / Regression ### Classification / Regression
The structure of classification and regression is similar. Below function can be used for both tasks. The structure of classification and regression is similar. The function below can be used for both tasks.
Please check the function `_build_classification_or_regression_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the detailed implementation.
```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
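Note how the label input changes with the task: an integer class id for classification and a single real value for regression. Below is a minimal sketch of one training instance for each case (all word ids and values are illustrative only):

```python
# Classification: (source word ids, target word ids, integer class label)
classification_instance = [[3, 7, 12], [5, 9], 1]
# Regression: (source word ids, target word ids, real-valued score)
regression_instance = [[3, 7, 12], [5, 9], 0.73]
```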
### Pairwise Rank ### Pairwise Rank
Please check the function `_build_rank_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the implementation.
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not used.
return cost, None, None
```
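A pairwise-rank training instance therefore carries four fields, one per data layer above. A minimal sketch (word ids are illustrative only):

```python
# One pairwise-rank instance: source, left_target, right_target, label.
source = [3, 7, 12]         # source sentence word ids
left_target = [5, 9]        # candidate expected to rank higher
right_target = [6, 4, 8]    # candidate expected to rank lower
label = 1                   # 1 if left_target should be ranked before right_target
rank_instance = [source, left_target, right_target, label]
```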
## Data Format ## Data Format
Below is a simple example for the data in `./data` Below is a simple example for the data in `./data`
...@@ -389,67 +282,7 @@ The example of this format is as follows. ...@@ -389,67 +282,7 @@ The example of this format is as follows.
## Training ## Training
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The parameters of the script `train.py` can be listed by running `python train.py --help`. Some important parameters are:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
Parameter description:
- `train_data_path` Training data path - `train_data_path` Training data path
- `test_data_path` Test data path, optional - `test_data_path` Test data path, optional
...@@ -460,48 +293,8 @@ Parameter description: ...@@ -460,48 +293,8 @@ Parameter description:
- `dnn_dims` The dimension of each layer of the model is set, the default is `256,128,64,32`,with 4 layers. - `dnn_dims` The dimensions of the model's layers; the default is `256,128,64,32`, i.e. a 4-layer model.
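During training, each field of a data instance is bound to a model input by position through a feeding dict. The sketch below mirrors the mapping defined in `train.py` for the classification/regression task; the rank task instead uses the fields source, left target, right target and label (positions 0 to 3).

```python
# Position of each instance field for the classification / regression task.
feeding = {"source_input": 0, "target_input": 1, "label_input": 2}
```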
## To predict using the trained model ## To predict using the trained model
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
Important parameters are The parameters of the script `infer.py` can be listed by running `python infer.py --help`. Some important parameters are:
- `data_path` Path for the data to predict - `data_path` Path for the data to predict
- `prediction_output_path` Prediction output path - `prediction_output_path` Prediction output path
......
...@@ -9,83 +9,81 @@ from utils import logger, ModelType, ModelArch, load_dic ...@@ -9,83 +9,81 @@ from utils import logger, ModelType, ModelArch, load_dic
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer") parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
parser.add_argument( parser.add_argument(
'--model_path', "--model_path", type=str, required=True, help="The path of trained model.")
type=str,
required=True,
help="path of model parameters file")
parser.add_argument( parser.add_argument(
'-i', "-i",
'--data_path', "--data_path",
type=str, type=str,
required=True, required=True,
help="path of the dataset to infer") help="The path of the data for inferring.")
parser.add_argument( parser.add_argument(
'-o', "-o",
'--prediction_output_path', "--prediction_output_path",
type=str, type=str,
required=True, required=True,
help="path to output the prediction") help="The path to save the predictions.")
parser.add_argument( parser.add_argument(
'-y', "-y",
'--model_type', "--model_type",
type=int, type=int,
required=True, required=True,
default=ModelType.CLASSIFICATION_MODE, default=ModelType.CLASSIFICATION_MODE,
help=("model type, %d for classification, %d for pairwise rank, " help=("The model type: %d for classification, %d for pairwise rank, "
"%d for regression (default: classification)") % "%d for regression (default: classification).") %
(ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE, (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE)) ModelType.REGRESSION_MODE))
parser.add_argument( parser.add_argument(
'-s', "-s",
'--source_dic_path', "--source_dic_path",
type=str, type=str,
required=False, required=False,
help="path of the source's word dic") help="The path of the source's word dictionary.")
parser.add_argument( parser.add_argument(
'--target_dic_path', "--target_dic_path",
type=str, type=str,
required=False, required=False,
help=("path of the target's word dictionary, " help=("The path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used")) "if this parameter is not set, the `source_dic_path` will be used."))
parser.add_argument( parser.add_argument(
'-a', "-a",
'--model_arch', "--model_arch",
type=int, type=int,
required=True, required=True,
default=ModelArch.CNN_MODE, default=ModelArch.CNN_MODE,
help="model architecture, %d for CNN, %d for FC, %d for RNN" % help="model architecture, %d for CNN, %d for FC, %d for RNN" %
(ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE)) (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument( parser.add_argument(
'--share_network_between_source_target', "--share_network_between_source_target",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share network parameters between source and target") help="whether to share network parameters between source and target")
parser.add_argument( parser.add_argument(
'--share_embed', "--share_embed",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share word embedding between source and target") help="whether to share word embedding between source and target")
parser.add_argument( parser.add_argument(
'--dnn_dims', "--dnn_dims",
type=str, type=str,
default='256,128,64,32', default="256,128,64,32",
    help=("dimentions of dnn layers, default is '256,128,64,32', " help=("The dimensions of dnn layers, default is `256,128,64,32`, "
          "which means create a 4-layer dnn, " "which means a dnn with 4 layers with "
          "demention of each layer is 256, 128, 64 and 32")) "dimensions 256, 128, 64 and 32 will be created."))
parser.add_argument( parser.add_argument(
'-c', "-c",
'--class_num', "--class_num",
type=int, type=int,
default=0, default=0,
help="number of categories for classification task.") help="The number of categories for classification task.")
args = parser.parse_args() args = parser.parse_args()
args.model_type = ModelType(args.model_type) args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch) args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification(): if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task." assert args.class_num > 1, ("The parameter class_num should be set "
"in classification task.")
layer_dims = map(int, args.dnn_dims.split(',')) layer_dims = map(int, args.dnn_dims.split(","))
args.target_dic_path = args.source_dic_path if not args.target_dic_path \ args.target_dic_path = args.source_dic_path if not args.target_dic_path \
else args.target_dic_path else args.target_dic_path
...@@ -94,8 +92,6 @@ paddle.init(use_gpu=False, trainer_count=1) ...@@ -94,8 +92,6 @@ paddle.init(use_gpu=False, trainer_count=1)
class Inferer(object): class Inferer(object):
def __init__(self, param_path): def __init__(self, param_path):
logger.info("create DSSM model")
prediction = DSSM( prediction = DSSM(
dnn_dims=layer_dims, dnn_dims=layer_dims,
vocab_sizes=[ vocab_sizes=[
...@@ -110,14 +106,13 @@ class Inferer(object): ...@@ -110,14 +106,13 @@ class Inferer(object):
is_infer=True)() is_infer=True)()
# load parameter # load parameter
logger.info("load model parameters from %s" % param_path) logger.info("Load the trained model from %s." % param_path)
self.parameters = paddle.parameters.Parameters.from_tar( self.parameters = paddle.parameters.Parameters.from_tar(
open(param_path, 'r')) open(param_path, "r"))
self.inferer = paddle.inference.Inference( self.inferer = paddle.inference.Inference(
output_layer=prediction, parameters=self.parameters) output_layer=prediction, parameters=self.parameters)
def infer(self, data_path): def infer(self, data_path):
logger.info("infer data...")
dataset = reader.Dataset( dataset = reader.Dataset(
train_path=data_path, train_path=data_path,
test_path=None, test_path=None,
...@@ -125,19 +120,20 @@ class Inferer(object): ...@@ -125,19 +120,20 @@ class Inferer(object):
target_dic_path=args.target_dic_path, target_dic_path=args.target_dic_path,
model_type=args.model_type, ) model_type=args.model_type, )
infer_reader = paddle.batch(dataset.infer, batch_size=1000) infer_reader = paddle.batch(dataset.infer, batch_size=1000)
logger.warning('write predictions to %s' % args.prediction_output_path) logger.warning("Write predictions to %s." % args.prediction_output_path)
output_f = open(args.prediction_output_path, 'w') output_f = open(args.prediction_output_path, "w")
for id, batch in enumerate(infer_reader()): for id, batch in enumerate(infer_reader()):
res = self.inferer.infer(input=batch) res = self.inferer.infer(input=batch)
predictions = [' '.join(map(str, x)) for x in res] predictions = [" ".join(map(str, x)) for x in res]
assert len(batch) == len(predictions), ( assert len(batch) == len(predictions), (
"predict error, %d inputs, " "Error! %d inputs are given, "
"but %d predictions") % (len(batch), len(predictions)) "but only %d predictions are returned.") % (len(batch),
output_f.write('\n'.join(map(str, predictions)) + '\n') len(predictions))
output_f.write("\n".join(map(str, predictions)) + "\n")
if __name__ == '__main__': if __name__ == "__main__":
inferer = Inferer(args.model_path) inferer = Inferer(args.model_path)
inferer.infer(args.data_path) inferer.infer(args.data_path)
...@@ -13,26 +13,33 @@ class DSSM(object): ...@@ -13,26 +13,33 @@ class DSSM(object):
class_num=None, class_num=None,
share_embed=False, share_embed=False,
is_infer=False): is_infer=False):
''' """
        @dnn_dims: list of int :param dnn_dims: The dimension of each layer in the semantic vector
dimentions of each layer in semantic vector generator. generator.
@vocab_sizes: 2-d tuple :type dnn_dims: list of int
size of both left and right items. :param vocab_sizes: The size of left and right items.
@model_type: int :type vocab_sizes: A list having 2 elements.
type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2' :param model_type: The type of task to train the DSSM model. The value
        @model_arch: int should be "classification: 0", "rank: 1" or
        model architecture "regression: 2".
@share_semantic_generator: bool :type model_type: int
whether to share the semantic vector generator for both left and right. :param model_arch: A value indicating the model architecture to use.
@share_embed: bool :type model_arch: int
whether to share the embeddings between left and right. :param share_semantic_generator: A flag indicating whether to share the
@class_num: int semantic vector between the left and
number of categories. the right item.
''' :type share_semantic_generator: bool
        ''' :param share_embed: A flag indicating whether to share the embeddings
between the left and the right item.
:type share_embed: bool
:param class_num: The number of categories.
:type class_num: int
"""
        assert len(vocab_sizes) == 2, ( assert len(vocab_sizes) == 2, (
            "vocab_sizes specify the sizes left and right inputs, " "The vocab_sizes specifies the sizes of the left and right inputs. "
"and dim should be 2.") "Its dimension should be 2.")
        assert len(dnn_dims) > 1, "more than two layers is needed." assert len(dnn_dims) > 1, ("In the DNN model, at least two layers "
"are needed.")
self.dnn_dims = dnn_dims self.dnn_dims = dnn_dims
self.vocab_sizes = vocab_sizes self.vocab_sizes = vocab_sizes
...@@ -42,91 +49,89 @@ class DSSM(object): ...@@ -42,91 +49,89 @@ class DSSM(object):
self.model_arch = ModelArch(model_arch) self.model_arch = ModelArch(model_arch)
self.class_num = class_num self.class_num = class_num
self.is_infer = is_infer self.is_infer = is_infer
logger.warning("build DSSM model with config of %s, %s" % logger.warning("Build DSSM model with config of %s, %s" %
(self.model_type, self.model_arch)) (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes)) logger.info("The vocabulary sizes are: %s" % str(self.vocab_sizes))
# bind model architecture # bind model architecture
_model_arch = { _model_arch = {
'cnn': self.create_cnn, "cnn": self.create_cnn,
'fc': self.create_fc, "fc": self.create_fc,
'rnn': self.create_rnn, "rnn": self.create_rnn,
} }
def _model_arch_creater(emb, prefix=''): def _model_arch_creater(emb, prefix=""):
sent_vec = _model_arch.get(str(model_arch))(emb, prefix) sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
dnn = self.create_dnn(sent_vec, prefix) dnn = self.create_dnn(sent_vec, prefix)
return dnn return dnn
self.model_arch_creater = _model_arch_creater self.model_arch_creater = _model_arch_creater
# build model type
_model_type = { _model_type = {
'classification': self._build_classification_model, "classification": self._build_classification_model,
'rank': self._build_rank_model, "rank": self._build_rank_model,
'regression': self._build_regression_model, "regression": self._build_regression_model,
} }
print 'model type: ', str(self.model_type) print("model type: ", str(self.model_type))
self.model_type_creater = _model_type[str(self.model_type)] self.model_type_creater = _model_type[str(self.model_type)]
def __call__(self): def __call__(self):
return self.model_type_creater() return self.model_type_creater()
def create_embedding(self, input, prefix=''): def create_embedding(self, input, prefix=""):
''' """
Create an embedding table whose name has a `prefix`. Create word embedding. The `prefix` is added in front of the name of
        ''' embedding's learnable parameter.
logger.info("create embedding table [%s] which dimention is %d" % """
        logger.info("Create embedding table [%s] whose dimension is %d. " %
(prefix, self.dnn_dims[0])) (prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding( emb = paddle.layer.embedding(
input=input, input=input,
size=self.dnn_dims[0], size=self.dnn_dims[0],
param_attr=ParamAttr(name='%s_emb.w' % prefix)) param_attr=ParamAttr(name="%s_emb.w" % prefix))
return emb return emb
def create_fc(self, emb, prefix=''): def create_fc(self, emb, prefix=""):
''' """
        A multi-layer fully connected neural networks. A multi-layer fully connected neural network.
@emb: paddle.layer :param emb: The output of the embedding layer
output of the embedding layer :type emb: paddle.layer
@prefix: str :param prefix: A prefix will be added to the layers' names.
prefix of layers' names, used to share parameters between :type prefix: str
more than one `fc` parts. """
'''
_input_layer = paddle.layer.pooling( _input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max()) input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc( fc = paddle.layer.fc(
input=_input_layer, input=_input_layer,
size=self.dnn_dims[1], size=self.dnn_dims[1],
param_attr=ParamAttr(name='%s_fc.w' % prefix), param_attr=ParamAttr(name="%s_fc.w" % prefix),
bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.)) bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
return fc return fc
def create_rnn(self, emb, prefix=''): def create_rnn(self, emb, prefix=""):
''' """
A GRU sentence vector learner. A GRU sentence vector learner.
''' """
gru = paddle.networks.simple_gru( gru = paddle.networks.simple_gru(
input=emb, input=emb,
size=self.dnn_dims[1], size=self.dnn_dims[1],
mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix), mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix),
mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix), mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
gru_param_attr=ParamAttr(name='%s_gru.w' % prefix), gru_param_attr=ParamAttr(name="%s_gru.w" % prefix),
gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix)) gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
sent_vec = paddle.layer.last_seq(gru) sent_vec = paddle.layer.last_seq(gru)
return sent_vec return sent_vec
def create_cnn(self, emb, prefix=''): def create_cnn(self, emb, prefix=""):
''' """
A multi-layer CNN. A multi-layer CNN.
@emb: paddle.layer :param emb: The word embedding.
output of the embedding layer :type emb: paddle.layer
@prefix: str :param prefix: The prefix will be added to of layers' names.
prefix of layers' names, used to share parameters between :type prefix: str
more than one `cnn` parts. """
'''
def create_conv(context_len, hidden_size, prefix): def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size) key = "%s_%d_%d" % (prefix, context_len, hidden_size)
...@@ -135,15 +140,15 @@ class DSSM(object): ...@@ -135,15 +140,15 @@ class DSSM(object):
context_len=context_len, context_len=context_len,
hidden_size=hidden_size, hidden_size=hidden_size,
# set parameter attr for parameter sharing # set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'), context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + '_fc.w'), fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + '_fc.b'), fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + '_pool.b')) pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv return conv
logger.info('create a sequence_conv_pool which context width is 3') logger.info("create a sequence_conv_pool which context width is 3")
conv_3 = create_conv(3, self.dnn_dims[1], "cnn") conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4') logger.info("create a sequence_conv_pool which context width is 4")
conv_4 = create_conv(4, self.dnn_dims[1], "cnn") conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4 return conv_3, conv_4
...@@ -160,8 +165,8 @@ class DSSM(object): ...@@ -160,8 +165,8 @@ class DSSM(object):
input=_input_layer, input=_input_layer,
size=dim, size=dim,
act=paddle.activation.Tanh(), act=paddle.activation.Tanh(),
param_attr=ParamAttr(name='%s.w' % name), param_attr=ParamAttr(name="%s.w" % name),
bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.)) bias_attr=ParamAttr(name="%s.b" % name, initial_std=0.))
_input_layer = fc _input_layer = fc
return _input_layer return _input_layer
...@@ -178,7 +183,7 @@ class DSSM(object): ...@@ -178,7 +183,7 @@ class DSSM(object):
is_classification=False) is_classification=False)
def _build_rank_model(self): def _build_rank_model(self):
''' """
Build a pairwise rank model, and the cost is returned. Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs: A pairwise rank model has 3 inputs:
...@@ -187,26 +192,26 @@ class DSSM(object): ...@@ -187,26 +192,26 @@ class DSSM(object):
- right_target sentence - right_target sentence
- label, 1 if left_target should be sorted in front of - label, 1 if left_target should be sorted in front of
right_target, otherwise 0. right_target, otherwise 0.
''' """
logger.info("build rank model") logger.info("build rank model")
assert self.model_type.is_rank() assert self.model_type.is_rank()
source = paddle.layer.data( source = paddle.layer.data(
name='source_input', name="source_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data( left_target = paddle.layer.data(
name='left_target_input', name="left_target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data( right_target = paddle.layer.data(
name='right_target_input', name="right_target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
if not self.is_infer: if not self.is_infer:
label = paddle.layer.data( label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1)) name="label_input", type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split( prefixs = "_ _ _".split(
) if self.share_semantic_generator else 'source target target'.split() ) if self.share_semantic_generator else "source target target".split()
        embed_prefixs = '_ _'.split( embed_prefixs = "_ _ _".split(
) if self.share_embed else 'source target target'.split() ) if self.share_embed else "source target target".split()
word_vecs = [] word_vecs = []
for id, input in enumerate([source, left_target, right_target]): for id, input in enumerate([source, left_target, right_target]):
...@@ -218,9 +223,9 @@ class DSSM(object): ...@@ -218,9 +223,9 @@ class DSSM(object):
x = self.model_arch_creater(input, prefix=prefixs[id]) x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x) semantics.append(x)
# cossim score of source and left_target # The cosine similarity score of source and left_target.
left_score = paddle.layer.cos_sim(semantics[0], semantics[1]) left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target # The cosine similarity score of source and right target.
right_score = paddle.layer.cos_sim(semantics[0], semantics[2]) right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
if not self.is_infer: if not self.is_infer:
...@@ -233,34 +238,33 @@ class DSSM(object): ...@@ -233,34 +238,33 @@ class DSSM(object):
return right_score return right_score
def _build_classification_or_regression_model(self, is_classification): def _build_classification_or_regression_model(self, is_classification):
''' """
Build a classification/regression model, and the cost is returned. Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs: The classification/regression task expects 3 inputs:
- source sentence - source sentence
- target sentence - target sentence
- classification label - classification label
''' """
if is_classification: if is_classification:
# prepare inputs.
assert self.class_num assert self.class_num
source = paddle.layer.data( source = paddle.layer.data(
name='source_input', name="source_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data( target = paddle.layer.data(
name='target_input', name="target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data( label = paddle.layer.data(
name='label_input', name="label_input",
type=paddle.data_type.integer_value(self.class_num) type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_vector(1)) if is_classification else paddle.data_type.dense_vector(1))
prefixs = '_ _'.split( prefixs = "_ _".split(
) if self.share_semantic_generator else 'source target'.split() ) if self.share_semantic_generator else "source target".split()
embed_prefixs = '_ _'.split( embed_prefixs = "_ _".split(
) if self.share_embed else 'source target'.split() ) if self.share_embed else "source target".split()
word_vecs = [] word_vecs = []
for id, input in enumerate([source, target]): for id, input in enumerate([source, target]):
......
...@@ -9,120 +9,129 @@ from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args ...@@ -9,120 +9,129 @@ from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example") parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")
parser.add_argument( parser.add_argument(
'-i', "-i",
'--train_data_path', "--train_data_path",
type=str, type=str,
required=False, required=False,
help="path of training dataset") help="The path of training data.")
parser.add_argument( parser.add_argument(
'-t', "-t",
'--test_data_path', "--test_data_path",
type=str, type=str,
required=False, required=False,
help="path of testing dataset") help="The path of testing data.")
parser.add_argument( parser.add_argument(
'-s', "-s",
'--source_dic_path', "--source_dic_path",
type=str, type=str,
required=False, required=False,
help="path of the source's word dic") help="The path of the source's word dictionary.")
parser.add_argument( parser.add_argument(
'--target_dic_path', "--target_dic_path",
type=str, type=str,
required=False, required=False,
help=("path of the target's word dictionary, " help=("The path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used")) "if this parameter is not set, the `source_dic_path` will be used"))
parser.add_argument( parser.add_argument(
'-b', "-b",
'--batch_size', "--batch_size",
type=int, type=int,
default=32, default=32,
help="size of mini-batch (default:32)") help="The size of mini-batch (default:32).")
parser.add_argument( parser.add_argument(
'-p', "-p",
'--num_passes', "--num_passes",
type=int, type=int,
default=10, default=10,
help="number of passes to run(default:10)") help="The number of passes to run(default:10).")
parser.add_argument( parser.add_argument(
'-y', "-y",
'--model_type', "--model_type",
type=int, type=int,
required=True, required=True,
default=ModelType.CLASSIFICATION_MODE, default=ModelType.CLASSIFICATION_MODE,
help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)" help=("model type, %d for classification, %d for pairwise rank, "
% (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE, "%d for regression (default: classification).") %
ModelType.REGRESSION_MODE)) (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
parser.add_argument( parser.add_argument(
'-a', "-a",
'--model_arch', "--model_arch",
type=int, type=int,
required=True, required=True,
default=ModelArch.CNN_MODE, default=ModelArch.CNN_MODE,
help="model architecture, %d for CNN, %d for FC, %d for RNN" % help="The model architecture, %d for CNN, %d for FC, %d for RNN." %
(ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE)) (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument( parser.add_argument(
'--share_network_between_source_target', "--share_network_between_source_target",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share network parameters between source and target") help="Whether to share network parameters between source and target.")
parser.add_argument( parser.add_argument(
'--share_embed', "--share_embed",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share word embedding between source and target") help="Whether to share word embedding between source and target.")
parser.add_argument( parser.add_argument(
'--dnn_dims', "--dnn_dims",
type=str, type=str,
default='256,128,64,32', default="256,128,64,32",
    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32" help=("The dimensions of dnn layers, default is '256,128,64,32', "
    ) "which means create a 4-layer dnn. The dimension of each layer is "
"'256, 128, 64 and 32'."))
parser.add_argument( parser.add_argument(
'--num_workers', type=int, default=1, help="num worker threads, default 1") "--num_workers",
type=int,
default=1,
help="The number of worker threads, default 1.")
parser.add_argument( parser.add_argument(
'--use_gpu', "--use_gpu",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to use GPU devices (default: False)") help="Whether to use GPU devices (default: False)")
parser.add_argument( parser.add_argument(
'-c', "-c",
'--class_num', "--class_num",
type=int, type=int,
default=0, default=0,
help="number of categories for classification task.") help="The number of categories for classification task.")
parser.add_argument( parser.add_argument(
'--model_output_prefix', "--model_output_prefix",
type=str, type=str,
default="./", default="./",
help="prefix of the path for model to store, (default: ./)") help="The prefix of the path to store the trained models (default: ./).")
parser.add_argument( parser.add_argument(
'-g', "-g",
'--num_batches_to_log', "--num_batches_to_log",
type=int, type=int,
default=100, default=100,
    help="number of batches to output train log, (default: 100)") help=("The log period. Every num_batches_to_log batches, "
"a training log will be printed. (default: 100)"))
parser.add_argument( parser.add_argument(
'-e', "-e",
'--num_batches_to_test', "--num_batches_to_test",
type=int, type=int,
default=200, default=200,
    help="number of batches to test, (default: 200)") help=("The test period. Every num_batches_to_test batches, "
                                                       "the specified test samples will be tested (default: 200)."))
parser.add_argument( parser.add_argument(
'-z', "-z",
'--num_batches_to_save_model', "--num_batches_to_save_model",
type=int, type=int,
default=400, default=400,
help="number of batches to output model, (default: 400)") help=("Every num_batches_to_save_model batches, "
"a trained model will be saved (default: 400)."))
# arguments check.
args = parser.parse_args() args = parser.parse_args()
args.model_type = ModelType(args.model_type) args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch) args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification(): if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task." assert args.class_num > 1, ("The parameter class_num should be set in "
"classification task.")
layer_dims = [int(i) for i in args.dnn_dims.split(',')] layer_dims = [int(i) for i in args.dnn_dims.split(",")]
args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path args.target_dic_path = args.source_dic_path if not \
args.target_dic_path else args.target_dic_path
def train(train_data_path=None, def train(train_data_path=None,
...@@ -138,15 +147,15 @@ def train(train_data_path=None, ...@@ -138,15 +147,15 @@ def train(train_data_path=None,
class_num=None, class_num=None,
num_workers=1, num_workers=1,
use_gpu=False): use_gpu=False):
''' """
Train the DSSM. Train the DSSM.
''' """
default_train_path = './data/rank/train.txt' default_train_path = "./data/rank/train.txt"
default_test_path = './data/rank/test.txt' default_test_path = "./data/rank/test.txt"
default_dic_path = './data/vocab.txt' default_dic_path = "./data/vocab.txt"
if not model_type.is_rank(): if not model_type.is_rank():
default_train_path = './data/classification/train.txt' default_train_path = "./data/classification/train.txt"
default_test_path = './data/classification/test.txt' default_test_path = "./data/classification/test.txt"
use_default_data = not train_data_path use_default_data = not train_data_path
...@@ -200,19 +209,19 @@ def train(train_data_path=None, ...@@ -200,19 +209,19 @@ def train(train_data_path=None,
feeding = {} feeding = {}
if model_type.is_classification() or model_type.is_regression(): if model_type.is_classification() or model_type.is_regression():
feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2} feeding = {"source_input": 0, "target_input": 1, "label_input": 2}
else: else:
feeding = { feeding = {
'source_input': 0, "source_input": 0,
'left_target_input': 1, "left_target_input": 1,
'right_target_input': 2, "right_target_input": 2,
'label_input': 3 "label_input": 3
} }
def _event_handler(event): def _event_handler(event):
''' """
Define batch handler Define batch handler
''' """
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
# output train log # output train log
if event.batch_id % args.num_batches_to_log == 0: if event.batch_id % args.num_batches_to_log == 0:
...@@ -249,7 +258,7 @@ def train(train_data_path=None, ...@@ -249,7 +258,7 @@ def train(train_data_path=None,
logger.info("Training has finished.") logger.info("Training has finished.")
if __name__ == '__main__': if __name__ == "__main__":
display_args(args) display_args(args)
train( train(
train_data_path=args.train_data_path, train_data_path=args.train_data_path,
......
...@@ -8,7 +8,7 @@ logger.setLevel(logging.INFO) ...@@ -8,7 +8,7 @@ logger.setLevel(logging.INFO)
def mode_attr_name(mode): def mode_attr_name(mode):
return mode.upper() + '_MODE' return mode.upper() + "_MODE"
def create_attrs(cls): def create_attrs(cls):
...@@ -17,9 +17,9 @@ def create_attrs(cls): ...@@ -17,9 +17,9 @@ def create_attrs(cls):
def make_check_method(cls): def make_check_method(cls):
''' """
create methods for classes. create methods for classes.
''' """
def method(mode): def method(mode):
def _method(self): def _method(self):
...@@ -28,7 +28,7 @@ def make_check_method(cls): ...@@ -28,7 +28,7 @@ def make_check_method(cls):
return _method return _method
for id, mode in enumerate(cls.modes): for id, mode in enumerate(cls.modes):
setattr(cls, 'is_' + mode, method(mode)) setattr(cls, "is_" + mode, method(mode))
def make_create_method(cls): def make_create_method(cls):
...@@ -41,10 +41,10 @@ def make_create_method(cls): ...@@ -41,10 +41,10 @@ def make_create_method(cls):
return _method return _method
for id, mode in enumerate(cls.modes): for id, mode in enumerate(cls.modes):
setattr(cls, 'create_' + mode, method(mode)) setattr(cls, "create_" + mode, method(mode))
def make_str_method(cls, type_name='unk'): def make_str_method(cls, type_name="unk"):
def _str_(self): def _str_(self):
for mode in cls.modes: for mode in cls.modes:
if self.mode == getattr(cls, mode_attr_name(mode)): if self.mode == getattr(cls, mode_attr_name(mode)):
...@@ -53,9 +53,9 @@ def make_str_method(cls, type_name='unk'): ...@@ -53,9 +53,9 @@ def make_str_method(cls, type_name='unk'):
def _hash_(self): def _hash_(self):
return self.mode return self.mode
setattr(cls, '__str__', _str_) setattr(cls, "__str__", _str_)
setattr(cls, '__repr__', _str_) setattr(cls, "__repr__", _str_)
setattr(cls, '__hash__', _hash_) setattr(cls, "__hash__", _hash_)
cls.__name__ = type_name cls.__name__ = type_name
...@@ -65,7 +65,7 @@ def _init_(self, mode, cls): ...@@ -65,7 +65,7 @@ def _init_(self, mode, cls):
elif isinstance(mode, cls): elif isinstance(mode, cls):
self.mode = mode.mode self.mode = mode.mode
else: else:
        raise Exception("wrong mode type, get type: %s, value: %s" % raise Exception("Wrong mode type. Got type: %s, value: %s." %
(type(mode), mode)) (type(mode), mode))
...@@ -77,21 +77,21 @@ def build_mode_class(cls): ...@@ -77,21 +77,21 @@ def build_mode_class(cls):
class TaskType(object): class TaskType(object):
modes = 'train test infer'.split() modes = "train test infer".split()
def __init__(self, mode): def __init__(self, mode):
_init_(self, mode, TaskType) _init_(self, mode, TaskType)
class ModelType: class ModelType:
modes = 'classification rank regression'.split() modes = "classification rank regression".split()
def __init__(self, mode): def __init__(self, mode):
_init_(self, mode, ModelType) _init_(self, mode, ModelType)
class ModelArch: class ModelArch:
modes = 'fc cnn rnn'.split() modes = "fc cnn rnn".split()
def __init__(self, mode): def __init__(self, mode):
_init_(self, mode, ModelArch) _init_(self, mode, ModelArch)
...@@ -103,22 +103,16 @@ build_mode_class(ModelArch) ...@@ -103,22 +103,16 @@ build_mode_class(ModelArch)
def sent2ids(sent, vocab): def sent2ids(sent, vocab):
''' """
    transform a sentence to a list of ids. Transform a sentence into a list of word ids.
"""
@sent: str
a sentence.
@vocab: dict
a word dic
'''
return [vocab.get(w, UNK) for w in sent.split()] return [vocab.get(w, UNK) for w in sent.split()]
def load_dic(path): def load_dic(path):
''' """
    word dic format: The format of the word dictionary: each line is a word.
each line is a word """
'''
dic = {} dic = {}
with open(path) as f: with open(path) as f:
for id, line in enumerate(f): for id, line in enumerate(f):
...@@ -128,13 +122,6 @@ def load_dic(path): ...@@ -128,13 +122,6 @@ def load_dic(path):
def display_args(args): def display_args(args):
    logger.info("arguments passed by command line:") logger.info("The arguments passed by the command line are:")
for k, v in sorted(v for v in vars(args).items()): for k, v in sorted(v for v in vars(args).items()):
logger.info("{}:\t{}".format(k, v)) logger.info("{}:\t{}".format(k, v))
if __name__ == '__main__':
t = TaskType(1)
t = TaskType.create_train()
print t
print 'is', t.is_train()
...@@ -33,7 +33,6 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True): ...@@ -33,7 +33,6 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
num_classes=dict_size, num_classes=dict_size,
param_attr=paddle.attr.Param(name="nce_w"), param_attr=paddle.attr.Param(name="nce_w"),
bias_attr=paddle.attr.Param(name="nce_b"), bias_attr=paddle.attr.Param(name="nce_b"),
act=paddle.activation.Sigmoid(),
num_neg_samples=25, num_neg_samples=25,
neg_distribution=None) neg_distribution=None)
else: else:
...@@ -41,7 +40,7 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True): ...@@ -41,7 +40,7 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
size=dict_size, size=dict_size,
input=paddle.layer.trans_full_matrix_projection( input=paddle.layer.trans_full_matrix_projection(
hidden_layer, param_attr=paddle.attr.Param(name="nce_w")), hidden_layer, param_attr=paddle.attr.Param(name="nce_w")),
act=paddle.activation.Sigmoid(), act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name="nce_b")) bias_attr=paddle.attr.Param(name="nce_b"))
......