Commit adee3cf1 authored by caoying03

Refine comments of DSSM.

Parent f3b1bb5a
This diff is collapsed.
@@ -65,10 +65,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are

### Create a word vector table for the text

```python
def create_embedding(self, input, prefix=''):
    """
    Create the word embedding. The `prefix` is added in front of the name
    of the embedding's learnable parameter.
    """
    logger.info("Create embedding table [%s] whose dimension is %d" %
                (prefix, self.dnn_dims[0]))
    emb = paddle.layer.embedding(
        input=input,
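        # Illustrative usage (the names follow the `_build_*` functions shown
        # later in this document): `input` is an integer-ID sequence layer, e.g.
        #
        #     source = paddle.layer.data(
        #         name="source_input",
        #         type=paddle.data_type.integer_value_sequence(vocab_size))
        #     emb = self.create_embedding(source, prefix="source")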
@@ -82,14 +83,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin

### CNN implementation

```python
def create_cnn(self, emb, prefix=''):
    """
    A multi-layer CNN.

    :param emb: The word embedding.
    :type emb: paddle.layer
    :param prefix: The prefix will be added to the layers' names.
    :type prefix: str
    """

    def create_conv(context_len, hidden_size, prefix):
        key = "%s_%d_%d" % (prefix, context_len, hidden_size)
        conv = paddle.networks.sequence_conv_pool(
@@ -97,15 +99,13 @@ def create_cnn(self, emb, prefix=''):
            context_len=context_len,
            hidden_size=hidden_size,
            # set parameter attr for parameter sharing
            context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
            fc_param_attr=ParamAttr(name=key + "_fc.w"),
            fc_bias_attr=ParamAttr(name=key + "_fc.b"),
            pool_bias_attr=ParamAttr(name=key + "_pool.b"))
        return conv

    conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
    conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
    return conv_3, conv_4
```
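Because every parameter in `create_conv` is named from the `prefix`-derived `key`, calling `create_cnn` twice with the same prefix makes PaddlePaddle bind both calls to the same underlying parameters. A minimal sketch of this sharing (the `left_emb` and `right_emb` names are illustrative, not from the original code):

```python
# Hypothetical usage: two semantic towers sharing one set of CNN weights.
# PaddlePaddle identifies parameters by their ParamAttr names, so reusing
# the prefix "cnn" reuses the parameters created by the first call.
left_vec = self.create_cnn(left_emb, prefix="cnn")
right_vec = self.create_cnn(right_emb, prefix="cnn")
```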
@@ -118,9 +118,9 @@ RNN is suitable for learning variable length of the information

```python
def create_rnn(self, emb, prefix=''):
    """
    A GRU sentence vector learner.
    """
    gru = paddle.networks.simple_gru(
        input=emb,
        size=self.dnn_dims[1],
@@ -136,14 +136,15 @@ def create_rnn(self, emb, prefix=''):

```python
def create_fc(self, emb, prefix=''):
    """
    A multi-layer fully connected neural network.

    :param emb: The output of the embedding layer.
    :type emb: paddle.layer
    :param prefix: A prefix will be added to the layers' names.
    :type prefix: str
    """
    _input_layer = paddle.layer.pooling(
        input=emb, pooling_type=paddle.pooling.Max())
    fc = paddle.layer.fc(
@@ -160,13 +161,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling

```python
def create_dnn(self, sent_vec, prefix):
    if len(self.dnn_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, id, dim)
            fc = paddle.layer.fc(
                input=_input_layer,
                size=dim,
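                # Illustrative note: with the default dnn_dims = [256, 128, 64, 32],
                # this loop stacks three fc layers named <prefix>_fc_0_128,
                # <prefix>_fc_1_64 and <prefix>_fc_2_32 (256 -> 128 -> 64 -> 32).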
@@ -180,117 +178,12 @@ def create_dnn(self, sent_vec, prefix):

### Classification / Regression

The structures of the classification and regression models are similar. The function below can be used for both tasks. Please check the function `_build_classification_or_regression_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the detailed implementation.
```python
def _build_classification_or_regression_model(self, is_classification):
    '''
    Build a classification/regression model, and the cost is returned.

    A classification/regression model has three inputs:
    - source sentence
    - target sentence
    - classification label
    '''
    # prepare inputs.
    assert self.class_num

    source = paddle.layer.data(
        name='source_input',
        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
    target = paddle.layer.data(
        name='target_input',
        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
    label = paddle.layer.data(
        name='label_input',
        type=paddle.data_type.integer_value(self.class_num)
        if is_classification else paddle.data_type.dense_input)

    prefixs = '_ _'.split(
    ) if self.share_semantic_generator else 'source target'.split()
    embed_prefixs = '_ _'.split(
    ) if self.share_embed else 'source target'.split()

    word_vecs = []
    for id, input in enumerate([source, target]):
        x = self.create_embedding(input, prefix=embed_prefixs[id])
        word_vecs.append(x)

    semantics = []
    for id, input in enumerate(word_vecs):
        x = self.model_arch_creater(input, prefix=prefixs[id])
        semantics.append(x)

    if is_classification:
        concated_vector = paddle.layer.concat(semantics)
        prediction = paddle.layer.fc(
            input=concated_vector,
            size=self.class_num,
            act=paddle.activation.Softmax())
        cost = paddle.layer.classification_cost(
            input=prediction, label=label)
    else:
        prediction = paddle.layer.cos_sim(*semantics)
        cost = paddle.layer.square_error_cost(prediction, label)

    if not self.is_infer:
        return cost, prediction, label
    return prediction
```
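For the regression branch, `paddle.layer.cos_sim` scores how close the two semantic vectors are; the core quantity is the cosine similarity. A plain-Python illustration of that quantity (not the layer itself):

```python
import math

def cosine(a, b):
    # dot(a, b) / (|a| * |b|)
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

print(cosine([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]))  # 1.0: identical directions
print(cosine([1.0, 0.0], [0.0, 1.0]))            # 0.0: orthogonal vectors
```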
### Pairwise Rank

Please check the function `_build_rank_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the implementation.
```python
def _build_rank_model(self):
    '''
    Build a pairwise rank model, and the cost is returned.

    A pairwise rank model has four inputs:
    - source sentence
    - left_target sentence
    - right_target sentence
    - label, 1 if left_target should be sorted in front of right_target,
      otherwise 0.
    '''
    source = paddle.layer.data(
        name='source_input',
        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
    left_target = paddle.layer.data(
        name='left_target_input',
        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
    right_target = paddle.layer.data(
        name='right_target_input',
        type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
    label = paddle.layer.data(
        name='label_input', type=paddle.data_type.integer_value(1))

    prefixs = '_ _ _'.split(
    ) if self.share_semantic_generator else 'source target target'.split()
    # NOTE: three embeddings are created below, so three prefixes are needed
    # here ('_ _' in the original would under-index when share_embed is set).
    embed_prefixs = '_ _ _'.split(
    ) if self.share_embed else 'source target target'.split()

    word_vecs = []
    for id, input in enumerate([source, left_target, right_target]):
        x = self.create_embedding(input, prefix=embed_prefixs[id])
        word_vecs.append(x)

    semantics = []
    for id, input in enumerate(word_vecs):
        x = self.model_arch_creater(input, prefix=prefixs[id])
        semantics.append(x)

    # cosine similarity score of source and left_target
    left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
    # cosine similarity score of source and right_target
    right_score = paddle.layer.cos_sim(semantics[0], semantics[2])

    # rank cost
    cost = paddle.layer.rank_cost(left_score, right_score, label=label)
    # prediction = left_score - right_score
    # but this operator is not supported currently,
    # so AUC will not be used.
    return cost, None, None
```
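`paddle.layer.rank_cost` applies a RankNet-style pairwise cost to the two similarity scores. A plain-Python sketch of that family of costs, to illustrate the idea rather than the layer's exact internals:

```python
import math

def pairwise_rank_cost(left_score, right_score, label):
    # label is 1.0 when left_target should rank above right_target, else 0.0.
    o = left_score - right_score
    # Cross-entropy of sigmoid(o) against the label: log(1 + e^o) - label * o.
    return math.log(1.0 + math.exp(o)) - label * o

print(pairwise_rank_cost(0.9, 0.2, 1.0))  # ~0.40: order already correct
print(pairwise_rank_cost(0.2, 0.9, 1.0))  # ~1.10: order inverted, higher cost
```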
## Data Format

Below is a simple example of the data in `./data`.

@@ -347,67 +240,7 @@ The example of this format is as follows.
## Training

We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The parameters of the script `train.py` can be listed by executing `python train.py --help`:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
                [-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
                [-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
                [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
                [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
                [--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
                [--model_output_prefix MODEL_OUTPUT_PREFIX]
                [-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
                [-z NUM_BATCHES_TO_SAVE_MODEL]

PaddlePaddle DSSM example

optional arguments:
  -h, --help            show this help message and exit
  -i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
                        path of training dataset
  -t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
                        path of testing dataset
  -s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
                        path of the source's word dic
  --target_dic_path TARGET_DIC_PATH
                        path of the target's word dic, if not set, the
                        `source_dic_path` will be used
  -b BATCH_SIZE, --batch_size BATCH_SIZE
                        size of mini-batch (default:32)
  -p NUM_PASSES, --num_passes NUM_PASSES
                        number of passes to run(default:10)
  -y MODEL_TYPE, --model_type MODEL_TYPE
                        model type, 0 for classification, 1 for pairwise rank,
                        2 for regression (default: classification)
  -a MODEL_ARCH, --model_arch MODEL_ARCH
                        model architecture, 1 for CNN, 0 for FC, 2 for RNN
  --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
                        whether to share network parameters between source and
                        target
  --share_embed SHARE_EMBED
                        whether to share word embedding between source and
                        target
  --dnn_dims DNN_DIMS   dimentions of dnn layers, default is '256,128,64,32',
                        which means create a 4-layer dnn, demention of each
                        layer is 256, 128, 64 and 32
  --num_workers NUM_WORKERS
                        num worker threads, default 1
  --use_gpu USE_GPU     whether to use GPU devices (default: False)
  -c CLASS_NUM, --class_num CLASS_NUM
                        number of categories for classification task.
  --model_output_prefix MODEL_OUTPUT_PREFIX
                        prefix of the path for model to store, (default: ./)
  -g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
                        number of batches to output train log, (default: 100)
  -e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
                        number of batches to test, (default: 200)
  -z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
                        number of batches to output model, (default: 400)
```
Some important parameters are:

- `train_data_path` Training data path
- `test_data_path` Test data path, optional
@@ -418,48 +251,8 @@ Parameter description:
- `dnn_dims` The dimensions of the model's layers; the default is `256,128,64,32`, which builds a 4-layer network.
## To predict using the trained model

The parameters of the script `infer.py` can be listed by executing `python infer.py --help`:

```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
                PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
                [--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
                [--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
                [--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
                [-c CLASS_NUM]

PaddlePaddle DSSM infer

optional arguments:
  -h, --help            show this help message and exit
  --model_path MODEL_PATH
                        path of model parameters file
  -i DATA_PATH, --data_path DATA_PATH
                        path of the dataset to infer
  -o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
                        path to output the prediction
  -y MODEL_TYPE, --model_type MODEL_TYPE
                        model type, 0 for classification, 1 for pairwise rank,
                        2 for regression (default: classification)
  -s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
                        path of the source's word dic
  --target_dic_path TARGET_DIC_PATH
                        path of the target's word dic, if not set, the
                        `source_dic_path` will be used
  -a MODEL_ARCH, --model_arch MODEL_ARCH
                        model architecture, 1 for CNN, 0 for FC, 2 for RNN
  --share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
                        whether to share network parameters between source and
                        target
  --share_embed SHARE_EMBED
                        whether to share word embedding between source and
                        target
  --dnn_dims DNN_DIMS   dimentions of dnn layers, default is '256,128,64,32',
                        which means create a 4-layer dnn, demention of each
                        layer is 256, 128, 64 and 32
  -c CLASS_NUM, --class_num CLASS_NUM
                        number of categories for classification task.
```

Some important parameters are:

- `data_path` Path of the data to predict
- `prediction_output_path` Prediction output path
@@ -9,30 +9,27 @@ from utils import logger, ModelType, ModelArch, load_dic
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
parser.add_argument(
    "--model_path", type=str, required=True, help="The path of the trained model.")
parser.add_argument(
    "-i",
    "--data_path",
    type=str,
    required=True,
    help="The path of the data for inference.")
parser.add_argument(
    "-o",
    "--prediction_output_path",
    type=str,
    required=True,
    help="The path to save the predictions.")
parser.add_argument(
    "-y",
    "--model_type",
    type=int,
    required=True,
    default=ModelType.CLASSIFICATION_MODE,
    help=("The model type: %d for classification, %d for pairwise rank, "
          "%d for regression (default: classification).") %
    (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
     ModelType.REGRESSION_MODE))
parser.add_argument(
@@ -40,13 +37,13 @@ parser.add_argument(
    "--source_dic_path",
    type=str,
    required=False,
    help="The path of the source's word dictionary.")
parser.add_argument(
    "--target_dic_path",
    type=str,
    required=False,
    help=("The path of the target's word dictionary; "
          "if this parameter is not set, the `source_dic_path` will be used."))
parser.add_argument(
    "-a",
    "--model_arch",
@@ -69,15 +66,15 @@ parser.add_argument(
    "--dnn_dims",
    type=str,
    default="256,128,64,32",
    help=("The dimensions of dnn layers, default is `256,128,64,32`, "
          "which means a dnn with 4 layers whose dimensions are "
          "256, 128, 64 and 32 will be created."))
parser.add_argument(
    "-c",
    "--class_num",
    type=int,
    default=0,
    help="The number of categories for the classification task.")

args = parser.parse_args()
args.model_type = ModelType(args.model_type)
@@ -9,120 +9,129 @@ from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args

parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")

parser.add_argument(
    "-i",
    "--train_data_path",
    type=str,
    required=False,
    help="The path of training data.")
parser.add_argument(
    "-t",
    "--test_data_path",
    type=str,
    required=False,
    help="The path of testing data.")
parser.add_argument(
    "-s",
    "--source_dic_path",
    type=str,
    required=False,
    help="The path of the source's word dictionary.")
parser.add_argument(
    "--target_dic_path",
    type=str,
    required=False,
    help=("The path of the target's word dictionary; "
          "if this parameter is not set, the `source_dic_path` will be used."))
parser.add_argument(
    "-b",
    "--batch_size",
    type=int,
    default=32,
    help="The size of mini-batch (default: 32).")
parser.add_argument(
    "-p",
    "--num_passes",
    type=int,
    default=10,
    help="The number of passes to run (default: 10).")
parser.add_argument(
    "-y",
    "--model_type",
    type=int,
    required=True,
    default=ModelType.CLASSIFICATION_MODE,
    help=("The model type: %d for classification, %d for pairwise rank, "
          "%d for regression (default: classification).") %
    (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
     ModelType.REGRESSION_MODE))
parser.add_argument(
    "-a",
    "--model_arch",
    type=int,
    required=True,
    default=ModelArch.CNN_MODE,
    help="The model architecture: %d for CNN, %d for FC, %d for RNN." %
    (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument(
    "--share_network_between_source_target",
    type=distutils.util.strtobool,
    default=False,
    help="Whether to share network parameters between source and target.")
parser.add_argument(
    "--share_embed",
    type=distutils.util.strtobool,
    default=False,
    help="Whether to share word embedding between source and target.")
parser.add_argument(
    "--dnn_dims",
    type=str,
    default="256,128,64,32",
    help=("The dimensions of dnn layers, default is '256,128,64,32', "
          "which means a 4-layer dnn is created. The dimensions of the "
          "layers are 256, 128, 64 and 32."))
parser.add_argument(
    "--num_workers",
    type=int,
    default=1,
    help="The number of worker threads, default 1.")
parser.add_argument(
    "--use_gpu",
    type=distutils.util.strtobool,
    default=False,
    help="Whether to use GPU devices (default: False).")
parser.add_argument(
    "-c",
    "--class_num",
    type=int,
    default=0,
    help="The number of categories for the classification task.")
parser.add_argument(
    "--model_output_prefix",
    type=str,
    default="./",
    help="The prefix of the path to store the trained models (default: ./).")
parser.add_argument(
    "-g",
    "--num_batches_to_log",
    type=int,
    default=100,
    help=("The log period. Every num_batches_to_log batches, "
          "a training log will be printed (default: 100)."))
parser.add_argument(
    "-e",
    "--num_batches_to_test",
    type=int,
    default=200,
    help=("The test period. Every num_batches_to_test batches, "
          "the specified test samples will be tested (default: 200)."))
parser.add_argument(
    "-z",
    "--num_batches_to_save_model",
    type=int,
    default=400,
    help=("Every num_batches_to_save_model batches, "
          "a trained model will be saved (default: 400)."))

# arguments check.
args = parser.parse_args()
args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification():
    assert args.class_num > 1, ("The parameter class_num should be set in "
                                "the classification task.")

layer_dims = [int(i) for i in args.dnn_dims.split(",")]
args.target_dic_path = args.source_dic_path if not \
    args.target_dic_path else args.target_dic_path
def train(train_data_path=None,
@@ -138,15 +147,15 @@ def train(train_data_path=None,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    """
    Train the DSSM.
    """
    default_train_path = "./data/rank/train.txt"
    default_test_path = "./data/rank/test.txt"
    default_dic_path = "./data/vocab.txt"
    if not model_type.is_rank():
        default_train_path = "./data/classification/train.txt"
        default_test_path = "./data/classification/test.txt"

    use_default_data = not train_data_path
@@ -200,19 +209,19 @@ def train(train_data_path=None,
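    # NOTE: `feeding` maps each data layer's name to the index of the
    # corresponding field in every sample produced by the reader, so the
    # trainer knows which field feeds which input layer.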
    feeding = {}
    if model_type.is_classification() or model_type.is_regression():
        feeding = {"source_input": 0, "target_input": 1, "label_input": 2}
    else:
        feeding = {
            "source_input": 0,
            "left_target_input": 1,
            "right_target_input": 2,
            "label_input": 3
        }
    def _event_handler(event):
        """
        Define the batch event handler.
        """
        if isinstance(event, paddle.event.EndIteration):
            # output train log
            if event.batch_id % args.num_batches_to_log == 0:
@@ -249,7 +258,7 @@ def train(train_data_path=None,
    logger.info("Training has finished.")


if __name__ == "__main__":
    display_args(args)
    train(
        train_data_path=args.train_data_path,
@@ -8,7 +8,7 @@ logger.setLevel(logging.INFO)

def mode_attr_name(mode):
    return mode.upper() + "_MODE"


def create_attrs(cls):
@@ -17,9 +17,9 @@ def create_attrs(cls):

def make_check_method(cls):
    """
    Create the `is_<mode>` check methods for a mode class.
    """

    def method(mode):
        def _method(self):
@@ -28,7 +28,7 @@ def make_check_method(cls):
        return _method

    for id, mode in enumerate(cls.modes):
        setattr(cls, "is_" + mode, method(mode))


def make_create_method(cls):
@@ -41,10 +41,10 @@ def make_create_method(cls):
        return _method

    for id, mode in enumerate(cls.modes):
        setattr(cls, "create_" + mode, method(mode))


def make_str_method(cls, type_name="unk"):
    def _str_(self):
        for mode in cls.modes:
            if self.mode == getattr(cls, mode_attr_name(mode)):
@@ -53,9 +53,9 @@ def make_str_method(cls, type_name='unk'):

    def _hash_(self):
        return self.mode

    setattr(cls, "__str__", _str_)
    setattr(cls, "__repr__", _str_)
    setattr(cls, "__hash__", _hash_)
    cls.__name__ = type_name
@@ -65,7 +65,7 @@ def _init_(self, mode, cls):
    elif isinstance(mode, cls):
        self.mode = mode.mode
    else:
        raise Exception("Wrong mode type: got type %s with value %s." %
                        (type(mode), mode))
@@ -77,21 +77,21 @@ def build_mode_class(cls):

class TaskType(object):
    modes = "train test infer".split()

    def __init__(self, mode):
        _init_(self, mode, TaskType)


class ModelType:
    modes = "classification rank regression".split()

    def __init__(self, mode):
        _init_(self, mode, ModelType)


class ModelArch:
    modes = "fc cnn rnn".split()

    def __init__(self, mode):
        _init_(self, mode, ModelArch)
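# A hypothetical usage sketch (assuming build_mode_class below has installed
# the helpers generated by make_check_method / make_create_method):
#
#     m = ModelType.create_classification()
#     m.is_classification()   # -> True
#     m.is_rank()             # -> False
#     str(m)                  # -> "classification"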
@@ -103,22 +103,16 @@ build_mode_class(ModelArch)

def sent2ids(sent, vocab):
    """
    Transform a sentence into a list of word ids.
    """
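    # Example (illustrative): with vocab = {"hello": 0, "world": 1} and
    # UNK = 2, sent2ids("hello there world", vocab) returns [0, 2, 1].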
    return [vocab.get(w, UNK) for w in sent.split()]
def load_dic(path):
    """
    Load a word dictionary. Each line of the dictionary file is one word.
    """
    dic = {}
    with open(path) as f:
        for id, line in enumerate(f):
def display_args(args):
    logger.info("The arguments passed from the command line:")
    for k, v in sorted(vars(args).items()):
        logger.info("{}:\t{}".format(k, v))
if __name__ == '__main__':
    t = TaskType(1)
    t = TaskType.create_train()
    print t
    print 'is', t.is_train()