提交 0096515a 编写于 作者: P peterzhang2029

add config.py

上级 266b8eeb
...@@ -39,16 +39,25 @@ nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(e ...@@ -39,16 +39,25 @@ nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(e
CNN网络具体代码实现如下: CNN网络具体代码实现如下:
```python ```python
def cnn_cov_group(group_input, hidden_size): def cnn_cov_group(group_input, hidden_size):
"""
Convolution group definition.
:param group_input: The input of this layer.
:type group_input: LayerOutput
:param hidden_size: Size of FC layer.
:type hidden_size: int
"""
conv3 = paddle.networks.sequence_conv_pool( conv3 = paddle.networks.sequence_conv_pool(
input=group_input, context_len=3, hidden_size=hidden_size) input=group_input, context_len=3, hidden_size=hidden_size)
conv4 = paddle.networks.sequence_conv_pool( conv4 = paddle.networks.sequence_conv_pool(
input=group_input, context_len=4, hidden_size=hidden_size) input=group_input, context_len=4, hidden_size=hidden_size)
output_group = paddle.layer.fc(input=[conv3, conv4],
linear_proj = paddle.layer.fc(input=[conv3, conv4],
size=hidden_size, size=hidden_size,
param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'), param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'),
bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'), bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'),
act=paddle.activation.Linear()) act=paddle.activation.Linear())
return output_group
return linear_proj
``` ```
PaddlePaddle 中已经封装好的带有池化的文本序列卷积模块:`paddle.networks.sequence_conv_pool`,可直接调用。 PaddlePaddle 中已经封装好的带有池化的文本序列卷积模块:`paddle.networks.sequence_conv_pool`,可直接调用。
...@@ -65,6 +74,33 @@ prob = paddle.layer.mixed(size=class_num, ...@@ -65,6 +74,33 @@ prob = paddle.layer.mixed(size=class_num,
pip install -r requirements.txt pip install -r requirements.txt
``` ```
## 指定训练配置参数
`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下:
```
class TrainerConfig(object):
# whether to use GPU for training
use_gpu = False
# the number of threads used in one machine
trainer_count = 1
# train batch size
batch_size = 32
...
class ModelConfig(object):
# embedding vector dimension
emb_size = 28
...
```
用户可以通过修改其中的具体参数来配置训练过程, 例如通过设置 `use_gpu` 参数来指定是否使用 GPU
进行训练。
## 使用 PaddlePaddle 内置数据运行 ## 使用 PaddlePaddle 内置数据运行
### 训练 ### 训练
...@@ -88,15 +124,15 @@ python infer.py --model_path 'models/params_pass_00000.tar.gz' ...@@ -88,15 +124,15 @@ python infer.py --model_path 'models/params_pass_00000.tar.gz'
输入数据格式如下:每一行为一条样本,以 `\t` 分隔,第一列是类别标签,第二列是输入文本的内容。以下是两条示例数据: 输入数据格式如下:每一行为一条样本,以 `\t` 分隔,第一列是类别标签,第二列是输入文本的内容。以下是两条示例数据:
``` ```
1 This movie is very good. The actor is so handsome. positive This movie is very good. The actor is so handsome.
0 What a terrible movie. I waste so much time. negative What a terrible movie. I waste so much time.
``` ```
2.编写数据读取接口 2.编写数据读取接口
自定义数据读取接口只需编写一个 Python 生成器实现**从原始输入文本中解析一条训练样本**的逻辑。以下代码片段实现了读取原始数据返回类型为: `paddle.data_type.integer_value_sub_sequence` 和 `paddle.data_type.integer_value` 自定义数据读取接口只需编写一个 Python 生成器实现**从原始输入文本中解析一条训练样本**的逻辑。以下代码片段实现了读取原始数据返回类型为: `paddle.data_type.integer_value_sub_sequence` 和 `paddle.data_type.integer_value`
```python ```python
def train_reader(data_dir, word_dict): def train_reader(data_dir, word_dict, label_dict):
""" """
Reader interface for training data Reader interface for training data
...@@ -105,6 +141,8 @@ def train_reader(data_dir, word_dict): ...@@ -105,6 +141,8 @@ def train_reader(data_dir, word_dict):
:param word_dict: path of word dictionary, :param word_dict: path of word dictionary,
the dictionary must has a "UNK" in it. the dictionary must has a "UNK" in it.
:type word_dict: Python dict :type word_dict: Python dict
:param label_dict: label dictionary, used to look up the label value for the label column of each sample.
:type label_dict: Python dict
""" """
def reader(): def reader():
...@@ -128,19 +166,21 @@ def train_reader(data_dir, word_dict): ...@@ -128,19 +166,21 @@ def train_reader(data_dir, word_dict):
if sent_ids: if sent_ids:
doc_ids.append(sent_ids) doc_ids.append(sent_ids)
yield doc_ids, int(line_split[lbl_col]) yield doc_ids, label_dict[line_split[lbl_col]]
return reader return reader
``` ```
需要注意的是, 本例中以英文句号`'.'`作为分隔符, 将一段文本分隔为一定数量的句子, 且每个句子表示为对应词表的索引数组(`sent_ids`)。 由于当前样本的表示(`doc_ids`)中包含了该段文本的所有句子, 因此,它的类型为:`paddle.data_type.integer_value_sub_sequence` 需要注意的是, 本例中以英文句号`'.'`作为分隔符, 将一段文本分隔为一定数量的句子, 且每个句子表示为对应词表的索引数组(`sent_ids`)。 由于当前样本的表示(`doc_ids`)中包含了该段文本的所有句子, 因此,它的类型为:`paddle.data_type.integer_value_sub_sequence`
3.指定命令行参数进行训练 3.指定命令行参数进行训练
`train.py`训练脚本中包含以下参数: `train.py`训练脚本中包含以下参数:
``` ```
Options: Options:
--train_data_dir TEXT The path of training dataset (default: None). If this --train_data_dir TEXT The path of training dataset (default: None). If
parameter is not set, imdb dataset will be used. this parameter is not set, imdb dataset will be
used.
--test_data_dir TEXT The path of testing dataset (default: None). If this --test_data_dir TEXT The path of testing dataset (default: None). If this
parameter is not set, imdb dataset will be used. parameter is not set, imdb dataset will be used.
--word_dict_path TEXT The path of word dictionary (default: None). If this --word_dict_path TEXT The path of word dictionary (default: None). If this
...@@ -148,10 +188,11 @@ Options: ...@@ -148,10 +188,11 @@ Options:
this parameter is set, but the file does not exist, this parameter is set, but the file does not exist,
word dictionay will be built from the training data word dictionay will be built from the training data
automatically. automatically.
--class_num INTEGER The class number (default: 2). --label_dict_path TEXT The path of label dictionary (default: None).If this
--batch_size INTEGER The number of training examples in one batch parameter is not set, imdb dataset will be used. If
(default: 32). this parameter is set, but the file does not exist,
--num_passes INTEGER The number of passes to train (default: 10). label dictionay will be built from the training data
automatically.
--model_save_dir TEXT The path to save the trained models (default: --model_save_dir TEXT The path to save the trained models (default:
'models'). 'models').
--help Show this message and exit. --help Show this message and exit.
...@@ -159,7 +200,7 @@ Options: ...@@ -159,7 +200,7 @@ Options:
修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行: 修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行:
```bash ```bash
python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'dict.txt' python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt'
``` ```
即可对样例数据进行训练。 即可对样例数据进行训练。
...@@ -177,14 +218,15 @@ Options: ...@@ -177,14 +218,15 @@ Options:
--model_path TEXT The path of saved model. [required] --model_path TEXT The path of saved model. [required]
--word_dict_path TEXT The path of word dictionary (default: None). If this --word_dict_path TEXT The path of word dictionary (default: None). If this
parameter is not set, imdb dataset will be used. parameter is not set, imdb dataset will be used.
--class_num INTEGER The class number (default: 2). --label_dict_path TEXT The path of label dictionary (default: None).If this
parameter is not set, imdb dataset will be used.
--batch_size INTEGER The number of examples in one batch (default: 32). --batch_size INTEGER The number of examples in one batch (default: 32).
--help Show this message and exit. --help Show this message and exit.
``` ```
2.以`data`目录下的示例数据为例,在终端执行: 2.以`data`目录下的示例数据为例,在终端执行:
```bash ```bash
python infer.py --data_path 'data/infer.txt' --word_dict_path 'dict.txt' --model_path 'models/params_pass_00000.tar.gz' python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz'
``` ```
即可对样例数据进行预测。 即可对样例数据进行预测。
__all__ = ["TrainerConfig", "ModelConfig"]
class TrainerConfig(object):
    """Training-process settings, read as plain class attributes.

    The training script imports this class under the alias ``conf``;
    edit the values below to change how training runs.
    """

    # whether to use GPU for training
    use_gpu = False
    # the number of threads used in one machine
    trainer_count = 1

    # train batch size
    batch_size = 32
    # number of passes during training
    num_passes = 10

    # learning rate for optimizer
    learning_rate = 1e-3
    # rate for L2Regularization (named a "learning rate" here; presumably
    # the regularization coefficient -- TODO confirm against train.py)
    l2_learning_rate = 1e-3
    # average_window for ModelAverage
    average_window = 0.5

    # buffer size for shuffling
    buf_size = 1000

    # log progress every log_period batches
    log_period = 100
class ModelConfig(object):
    """Network-size settings, read as plain class attributes.

    The network definition imports this class under the alias ``conf``
    and reads ``emb_size`` for the embedding layer and ``hidden_size``
    for the per-sentence CNN group.
    """

    # embedding vector dimension
    emb_size = 28
    # size of sentence vector representation and fc layer in cnn
    hidden_size = 128
1 I liked the film. Some of the action scenes were very interesting, tense and well done. I especially liked the opening scene which had a semi truck in it. Also the film is funny is several parts. I'd give the film an 8 out of 10. positive I liked the film. Some of the action scenes were very interesting, tense and well done. I especially liked the opening scene which had a semi truck in it. Also the film is funny is several parts. I'd give the film an 8 out of 10.
0 The plot for Descent, if it actually can be called a plot, has two noteworthy events. One near the beginning - one at the end. Together these events make up maybe 5% of the total movie time. Everything (and I mean _everything_) in between is basically the director's desperate effort to fill in the minutes. negative The plot for Descent, if it actually can be called a plot, has two noteworthy events. One near the beginning - one at the end. Together these events make up maybe 5% of the total movie time. Everything (and I mean _everything_) in between is basically the director's desperate effort to fill in the minutes.
0 This film lacked something I couldn't put my finger on at first: charisma on the part of the leading actress. This inevitably translated to lack of chemistry when she shared the screen with her leading man. Even the romantic scenes came across as being merely the actors at play. negative This film lacked something I couldn't put my finger on at first: charisma on the part of the leading actress. This inevitably translated to lack of chemistry when she shared the screen with her leading man. Even the romantic scenes came across as being merely the actors at play.
0 I read the book a long time back and don't specifically remember the plot but do remember that I enjoyed it. Since I'm home sick on the couch it seemed like a good idea and Hey !! It is a Lifetime movie.<br /><br />The movie is populated with grade B actors and actresses.<br /><br />The female cast is right out of Desperate Housewives. negative I read the book a long time back and don't specifically remember the plot but do remember that I enjoyed it. Since I'm home sick on the couch it seemed like a good idea and Hey !! It is a Lifetime movie.<br /><br />The movie is populated with grade B actors and actresses.<br /><br />The female cast is right out of Desperate Housewives.
\ No newline at end of file \ No newline at end of file
0 It was a Sunday night and I was waiting for the advertised movie on TV. They said it was a comedy! The movie started, 10 minutes passed, after that 30 minutes and I didn't laugh not even once. The fact is that the movie ended and I didn't get even on echance to laugh. negative It was a Sunday night and I was waiting for the advertised movie on TV. They said it was a comedy! The movie started, 10 minutes passed, after that 30 minutes and I didn't laugh not even once. The fact is that the movie ended and I didn't get even on echance to laugh.
0 I saw this piece of garbage on AMC last night, and wonder how it could be considered in any way an American Movie Classic. It was awful in every way. How badly did Jack Lemmon, James Stewart and the rest of the cast need cash that they would even consider doing this movie? negative I saw this piece of garbage on AMC last night, and wonder how it could be considered in any way an American Movie Classic. It was awful in every way. How badly did Jack Lemmon, James Stewart and the rest of the cast need cash that they would even consider doing this movie?
1 its not as good as the first movie,but its a good solid movie its has good car chase scenes,on the remake of this movie there a story for are hero to drive fast as his trying to rush to the side of his ailing wife,the ending is great just a good fair movie to watch in my opinion. positive its not as good as the first movie,but its a good solid movie its has good car chase scenes,on the remake of this movie there a story for are hero to drive fast as his trying to rush to the side of his ailing wife,the ending is great just a good fair movie to watch in my opinion.
1 Rosalind Russell executes a power-house performance as Rosie Lord, a very wealthy woman with greedy heirs. With an Auntie Mame-type character, this actress can never go wrong. Her very-real terror at being in an insane assylum is a wonderful piece of acting. Everyone should watch this. positive Rosalind Russell executes a power-house performance as Rosie Lord, a very wealthy woman with greedy heirs. With an Auntie Mame-type character, this actress can never go wrong. Her very-real terror at being in an insane assylum is a wonderful piece of acting. Everyone should watch this.
\ No newline at end of file \ No newline at end of file
...@@ -81,16 +81,25 @@ nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(e ...@@ -81,16 +81,25 @@ nest_group = paddle.layer.recurrent_group(input=[paddle.layer.SubsequenceInput(e
CNN网络具体代码实现如下: CNN网络具体代码实现如下:
```python ```python
def cnn_cov_group(group_input, hidden_size): def cnn_cov_group(group_input, hidden_size):
"""
Convolution group definition.
:param group_input: The input of this layer.
:type group_input: LayerOutput
:param hidden_size: Size of FC layer.
:type hidden_size: int
"""
conv3 = paddle.networks.sequence_conv_pool( conv3 = paddle.networks.sequence_conv_pool(
input=group_input, context_len=3, hidden_size=hidden_size) input=group_input, context_len=3, hidden_size=hidden_size)
conv4 = paddle.networks.sequence_conv_pool( conv4 = paddle.networks.sequence_conv_pool(
input=group_input, context_len=4, hidden_size=hidden_size) input=group_input, context_len=4, hidden_size=hidden_size)
output_group = paddle.layer.fc(input=[conv3, conv4],
linear_proj = paddle.layer.fc(input=[conv3, conv4],
size=hidden_size, size=hidden_size,
param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'), param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'),
bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'), bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'),
act=paddle.activation.Linear()) act=paddle.activation.Linear())
return output_group
return linear_proj
``` ```
PaddlePaddle 中已经封装好的带有池化的文本序列卷积模块:`paddle.networks.sequence_conv_pool`,可直接调用。 PaddlePaddle 中已经封装好的带有池化的文本序列卷积模块:`paddle.networks.sequence_conv_pool`,可直接调用。
...@@ -107,6 +116,33 @@ prob = paddle.layer.mixed(size=class_num, ...@@ -107,6 +116,33 @@ prob = paddle.layer.mixed(size=class_num,
pip install -r requirements.txt pip install -r requirements.txt
``` ```
## 指定训练配置参数
`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下:
```
class TrainerConfig(object):
# whether to use GPU for training
use_gpu = False
# the number of threads used in one machine
trainer_count = 1
# train batch size
batch_size = 32
...
class ModelConfig(object):
# embedding vector dimension
emb_size = 28
...
```
用户可以通过修改其中的具体参数来配置训练过程, 例如通过设置 `use_gpu` 参数来指定是否使用 GPU
进行训练。
## 使用 PaddlePaddle 内置数据运行 ## 使用 PaddlePaddle 内置数据运行
### 训练 ### 训练
...@@ -130,15 +166,15 @@ python infer.py --model_path 'models/params_pass_00000.tar.gz' ...@@ -130,15 +166,15 @@ python infer.py --model_path 'models/params_pass_00000.tar.gz'
输入数据格式如下:每一行为一条样本,以 `\t` 分隔,第一列是类别标签,第二列是输入文本的内容。以下是两条示例数据: 输入数据格式如下:每一行为一条样本,以 `\t` 分隔,第一列是类别标签,第二列是输入文本的内容。以下是两条示例数据:
``` ```
1 This movie is very good. The actor is so handsome. positive This movie is very good. The actor is so handsome.
0 What a terrible movie. I waste so much time. negative What a terrible movie. I waste so much time.
``` ```
2.编写数据读取接口 2.编写数据读取接口
自定义数据读取接口只需编写一个 Python 生成器实现**从原始输入文本中解析一条训练样本**的逻辑。以下代码片段实现了读取原始数据返回类型为: `paddle.data_type.integer_value_sub_sequence` 和 `paddle.data_type.integer_value` 自定义数据读取接口只需编写一个 Python 生成器实现**从原始输入文本中解析一条训练样本**的逻辑。以下代码片段实现了读取原始数据返回类型为: `paddle.data_type.integer_value_sub_sequence` 和 `paddle.data_type.integer_value`
```python ```python
def train_reader(data_dir, word_dict): def train_reader(data_dir, word_dict, label_dict):
""" """
Reader interface for training data Reader interface for training data
...@@ -147,6 +183,8 @@ def train_reader(data_dir, word_dict): ...@@ -147,6 +183,8 @@ def train_reader(data_dir, word_dict):
:param word_dict: path of word dictionary, :param word_dict: path of word dictionary,
the dictionary must has a "UNK" in it. the dictionary must has a "UNK" in it.
:type word_dict: Python dict :type word_dict: Python dict
:param label_dict: label dictionary, used to look up the label value for the label column of each sample.
:type label_dict: Python dict
""" """
def reader(): def reader():
...@@ -170,19 +208,21 @@ def train_reader(data_dir, word_dict): ...@@ -170,19 +208,21 @@ def train_reader(data_dir, word_dict):
if sent_ids: if sent_ids:
doc_ids.append(sent_ids) doc_ids.append(sent_ids)
yield doc_ids, int(line_split[lbl_col]) yield doc_ids, label_dict[line_split[lbl_col]]
return reader return reader
``` ```
需要注意的是, 本例中以英文句号`'.'`作为分隔符, 将一段文本分隔为一定数量的句子, 且每个句子表示为对应词表的索引数组(`sent_ids`)。 由于当前样本的表示(`doc_ids`)中包含了该段文本的所有句子, 因此,它的类型为:`paddle.data_type.integer_value_sub_sequence`。 需要注意的是, 本例中以英文句号`'.'`作为分隔符, 将一段文本分隔为一定数量的句子, 且每个句子表示为对应词表的索引数组(`sent_ids`)。 由于当前样本的表示(`doc_ids`)中包含了该段文本的所有句子, 因此,它的类型为:`paddle.data_type.integer_value_sub_sequence`。
3.指定命令行参数进行训练 3.指定命令行参数进行训练
`train.py`训练脚本中包含以下参数: `train.py`训练脚本中包含以下参数:
``` ```
Options: Options:
--train_data_dir TEXT The path of training dataset (default: None). If this --train_data_dir TEXT The path of training dataset (default: None). If
parameter is not set, imdb dataset will be used. this parameter is not set, imdb dataset will be
used.
--test_data_dir TEXT The path of testing dataset (default: None). If this --test_data_dir TEXT The path of testing dataset (default: None). If this
parameter is not set, imdb dataset will be used. parameter is not set, imdb dataset will be used.
--word_dict_path TEXT The path of word dictionary (default: None). If this --word_dict_path TEXT The path of word dictionary (default: None). If this
...@@ -190,10 +230,11 @@ Options: ...@@ -190,10 +230,11 @@ Options:
this parameter is set, but the file does not exist, this parameter is set, but the file does not exist,
word dictionay will be built from the training data word dictionay will be built from the training data
automatically. automatically.
--class_num INTEGER The class number (default: 2). --label_dict_path TEXT The path of label dictionary (default: None).If this
--batch_size INTEGER The number of training examples in one batch parameter is not set, imdb dataset will be used. If
(default: 32). this parameter is set, but the file does not exist,
--num_passes INTEGER The number of passes to train (default: 10). label dictionay will be built from the training data
automatically.
--model_save_dir TEXT The path to save the trained models (default: --model_save_dir TEXT The path to save the trained models (default:
'models'). 'models').
--help Show this message and exit. --help Show this message and exit.
...@@ -201,7 +242,7 @@ Options: ...@@ -201,7 +242,7 @@ Options:
修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行: 修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行:
```bash ```bash
python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'dict.txt' python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt'
``` ```
即可对样例数据进行训练。 即可对样例数据进行训练。
...@@ -219,14 +260,15 @@ Options: ...@@ -219,14 +260,15 @@ Options:
--model_path TEXT The path of saved model. [required] --model_path TEXT The path of saved model. [required]
--word_dict_path TEXT The path of word dictionary (default: None). If this --word_dict_path TEXT The path of word dictionary (default: None). If this
parameter is not set, imdb dataset will be used. parameter is not set, imdb dataset will be used.
--class_num INTEGER The class number (default: 2). --label_dict_path TEXT The path of label dictionary (default: None).If this
parameter is not set, imdb dataset will be used.
--batch_size INTEGER The number of examples in one batch (default: 32). --batch_size INTEGER The number of examples in one batch (default: 32).
--help Show this message and exit. --help Show this message and exit.
``` ```
2.以`data`目录下的示例数据为例,在终端执行: 2.以`data`目录下的示例数据为例,在终端执行:
```bash ```bash
python infer.py --data_path 'data/infer.txt' --word_dict_path 'dict.txt' --model_path 'models/params_pass_00000.tar.gz' python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz'
``` ```
即可对样例数据进行预测。 即可对样例数据进行预测。
......
...@@ -6,8 +6,8 @@ import click ...@@ -6,8 +6,8 @@ import click
import paddle.v2 as paddle import paddle.v2 as paddle
import reader import reader
from network_conf import nest_net from network_conf import nested_net
from utils import logger, load_dict from utils import logger, load_dict, load_reverse_dict
@click.command('infer') @click.command('infer')
...@@ -26,14 +26,18 @@ from utils import logger, load_dict ...@@ -26,14 +26,18 @@ from utils import logger, load_dict
help=("The path of word dictionary (default: None). " help=("The path of word dictionary (default: None). "
"If this parameter is not set, imdb dataset will be used.")) "If this parameter is not set, imdb dataset will be used."))
@click.option( @click.option(
"--class_num", type=int, default=2, help="The class number (default: 2).") "--label_dict_path",
type=str,
default=None,
help=("The path of label dictionary (default: None)."
"If this parameter is not set, imdb dataset will be used. "))
@click.option( @click.option(
"--batch_size", "--batch_size",
type=int, type=int,
default=32, default=32,
help="The number of examples in one batch (default: 32).") help="The number of examples in one batch (default: 32).")
def infer(data_path, model_path, word_dict_path, batch_size, class_num): def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path):
def _infer_a_batch(inferer, test_batch, ids_2_word): def _infer_a_batch(inferer, test_batch, ids_2_word, ids_2_label):
probs = inferer.infer(input=test_batch, field=["value"]) probs = inferer.infer(input=test_batch, field=["value"])
assert len(probs) == len(test_batch) assert len(probs) == len(test_batch)
for word_ids, prob in zip(test_batch, probs): for word_ids, prob in zip(test_batch, probs):
...@@ -41,7 +45,7 @@ def infer(data_path, model_path, word_dict_path, batch_size, class_num): ...@@ -41,7 +45,7 @@ def infer(data_path, model_path, word_dict_path, batch_size, class_num):
for sent in word_ids[0]: for sent in word_ids[0]:
sent_ids.extend(sent) sent_ids.extend(sent)
word_text = " ".join([ids_2_word[id] for id in sent_ids]) word_text = " ".join([ids_2_word[id] for id in sent_ids])
print("%s\t%s\t%s" % (prob.argmax(), print("%s\t%s\t%s" % (ids_2_label[prob.argmax()],
" ".join(["{:0.4f}".format(p) " ".join(["{:0.4f}".format(p)
for p in prob]), word_text)) for p in prob]), word_text))
...@@ -53,25 +57,30 @@ def infer(data_path, model_path, word_dict_path, batch_size, class_num): ...@@ -53,25 +57,30 @@ def infer(data_path, model_path, word_dict_path, batch_size, class_num):
word_dict = reader.imdb_word_dict() word_dict = reader.imdb_word_dict()
word_reverse_dict = dict((value, key) word_reverse_dict = dict((value, key)
for key, value in word_dict.iteritems()) for key, value in word_dict.iteritems())
label_reverse_dict = {0: "positive", 1: "negative"}
test_reader = reader.imdb_test(word_dict) test_reader = reader.imdb_test(word_dict)
class_num = 2 class_num = 2
else: else:
assert os.path.exists( assert os.path.exists(
word_dict_path), "The word dictionary file does not exist" word_dict_path), "The word dictionary file does not exist"
assert os.path.exists(
label_dict_path), "The label dictionary file does not exist"
word_dict = load_dict(word_dict_path) word_dict = load_dict(word_dict_path)
word_reverse_dict = dict((value, key) word_reverse_dict = dict((value, key)
for key, value in word_dict.iteritems()) for key, value in word_dict.iteritems())
label_reverse_dict = load_reverse_dict(label_dict_path)
class_num = len(label_reverse_dict)
test_reader = reader.infer_reader(data_path, word_dict)() test_reader = reader.infer_reader(data_path, word_dict)()
dict_dim = len(word_dict) dict_dim = len(word_dict)
prob_layer = nest_net(dict_dim, class_num=class_num, is_infer=True) prob_layer = nested_net(dict_dim, class_num, is_infer=True)
# initialize PaddlePaddle # initialize PaddlePaddle.
paddle.init(use_gpu=True, trainer_count=4) paddle.init(use_gpu=False, trainer_count=1)
# load the trained models # load the trained models.
parameters = paddle.parameters.Parameters.from_tar( parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r")) gzip.open(model_path, "r"))
inferer = paddle.inference.Inference( inferer = paddle.inference.Inference(
...@@ -81,11 +90,13 @@ def infer(data_path, model_path, word_dict_path, batch_size, class_num): ...@@ -81,11 +90,13 @@ def infer(data_path, model_path, word_dict_path, batch_size, class_num):
for idx, item in enumerate(test_reader): for idx, item in enumerate(test_reader):
test_batch.append([item[0]]) test_batch.append([item[0]])
if len(test_batch) == batch_size: if len(test_batch) == batch_size:
_infer_a_batch(inferer, test_batch, word_reverse_dict) _infer_a_batch(inferer, test_batch, word_reverse_dict,
label_reverse_dict)
test_batch = [] test_batch = []
if len(test_batch): if len(test_batch):
_infer_a_batch(inferer, test_batch, word_reverse_dict) _infer_a_batch(inferer, test_batch, word_reverse_dict,
label_reverse_dict)
test_batch = [] test_batch = []
......
import paddle.v2 as paddle import paddle.v2 as paddle
from config import ModelConfig as conf
def cnn_cov_group(group_input, hidden_size): def cnn_cov_group(group_input, hidden_size):
"""
Convolution group definition.
:param group_input: The input of this layer.
:type group_input: LayerOutput
:param hidden_size: Size of FC layer.
:type hidden_size: int
"""
conv3 = paddle.networks.sequence_conv_pool( conv3 = paddle.networks.sequence_conv_pool(
input=group_input, context_len=3, hidden_size=hidden_size) input=group_input, context_len=3, hidden_size=hidden_size)
conv4 = paddle.networks.sequence_conv_pool( conv4 = paddle.networks.sequence_conv_pool(
input=group_input, context_len=4, hidden_size=hidden_size) input=group_input, context_len=4, hidden_size=hidden_size)
output_group = paddle.layer.fc( linear_proj = paddle.layer.fc(
input=[conv3, conv4], input=[conv3, conv4],
size=hidden_size, size=hidden_size,
param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'), param_attr=paddle.attr.ParamAttr(name='_cov_value_weight'),
bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'), bias_attr=paddle.attr.ParamAttr(name='_cov_value_bias'),
act=paddle.activation.Linear()) act=paddle.activation.Linear())
return output_group return linear_proj
def nest_net(dict_dim, def nested_net(dict_dim, class_num, is_infer=False):
emb_size=28, """
hidden_size=128, Nested network definition.
class_num=2, :param dict_dim: Size of word dictionary.
is_infer=False): :type dict_dim: int
:params class_num: Number of instance class.
:type class_num: int
:params is_infer: The boolean parameter
indicating inferring or training.
:type is_infer: bool
"""
data = paddle.layer.data( data = paddle.layer.data(
"word", paddle.data_type.integer_value_sub_sequence(dict_dim)) "word", paddle.data_type.integer_value_sub_sequence(dict_dim))
emb = paddle.layer.embedding(input=data, size=emb_size) emb = paddle.layer.embedding(input=data, size=conf.emb_size)
nest_group = paddle.layer.recurrent_group( nest_group = paddle.layer.recurrent_group(
input=[paddle.layer.SubsequenceInput(emb), hidden_size], input=[paddle.layer.SubsequenceInput(emb), conf.hidden_size],
step=cnn_cov_group) step=cnn_cov_group)
avg_pool = paddle.layer.pooling( avg_pool = paddle.layer.pooling(
input=nest_group, input=nest_group,
......
...@@ -155,7 +155,7 @@ def imdb_word_dict(): ...@@ -155,7 +155,7 @@ def imdb_word_dict():
re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
def train_reader(data_dir, word_dict): def train_reader(data_dir, word_dict, label_dict):
""" """
Reader interface for training data Reader interface for training data
...@@ -164,6 +164,8 @@ def train_reader(data_dir, word_dict): ...@@ -164,6 +164,8 @@ def train_reader(data_dir, word_dict):
:param word_dict: path of word dictionary, :param word_dict: path of word dictionary,
the dictionary must has a "UNK" in it. the dictionary must has a "UNK" in it.
:type word_dict: Python dict :type word_dict: Python dict
:param label_dict: label dictionary, used to look up the label value for the label column of each sample.
:type label_dict: Python dict
""" """
def reader(): def reader():
...@@ -187,7 +189,7 @@ def train_reader(data_dir, word_dict): ...@@ -187,7 +189,7 @@ def train_reader(data_dir, word_dict):
if sent_ids: if sent_ids:
doc_ids.append(sent_ids) doc_ids.append(sent_ids)
yield doc_ids, int(line_split[lbl_col]) yield doc_ids, label_dict[line_split[lbl_col]]
return reader return reader
......
...@@ -6,8 +6,9 @@ import click ...@@ -6,8 +6,9 @@ import click
import paddle.v2 as paddle import paddle.v2 as paddle
import reader import reader
from network_conf import nest_net from network_conf import nested_net
from utils import build_dict, load_dict, logger from utils import build_word_dict, build_label_dict, load_dict, logger
from config import TrainerConfig as conf
@click.command('train') @click.command('train')
...@@ -33,25 +34,21 @@ from utils import build_dict, load_dict, logger ...@@ -33,25 +34,21 @@ from utils import build_dict, load_dict, logger
"word dictionay will be built from " "word dictionay will be built from "
"the training data automatically.")) "the training data automatically."))
@click.option( @click.option(
"--class_num", type=int, default=2, help="The class number (default: 2).") "--label_dict_path",
@click.option( type=str,
"--batch_size", default=None,
type=int, help=("The path of label dictionary (default: None)."
default=32, "If this parameter is not set, imdb dataset will be used. "
help=("The number of training examples in one batch " "If this parameter is set, but the file does not exist, "
"(default: 32).")) "label dictionay will be built from "
@click.option( "the training data automatically."))
"--num_passes",
type=int,
default=10,
help="The number of passes to train (default: 10).")
@click.option( @click.option(
"--model_save_dir", "--model_save_dir",
type=str, type=str,
default="models", default="models",
help="The path to save the trained models (default: 'models').") help="The path to save the trained models (default: 'models').")
def train(train_data_dir, test_data_dir, word_dict_path, class_num, def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
model_save_dir, batch_size, num_passes): model_save_dir):
""" """
:params train_data_path: path of training data, if this parameter :params train_data_path: path of training data, if this parameter
is not specified, imdb dataset will be used to run this example is not specified, imdb dataset will be used to run this example
...@@ -59,18 +56,18 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num, ...@@ -59,18 +56,18 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num,
:params test_data_path: path of testing data, if this parameter :params test_data_path: path of testing data, if this parameter
is not specified, imdb dataset will be used to run this example is not specified, imdb dataset will be used to run this example
:type test_data_path: str :type test_data_path: str
:params word_dict_path: path of training data, if this parameter :params word_dict_path: path of word dictionary, if this parameter
is not specified, imdb dataset will be used to run this example is not specified, imdb dataset will be used to run this example
:type word_dict_path: str :type word_dict_path: str
:params label_dict_path: path of label dictionary, if this parameter
is not specified, imdb dataset will be used to run this example
:type label_dict_path: str
:params model_save_dir: dir where models saved :params model_save_dir: dir where models saved
:type num_pass: str :type model_save_dir: str
:params batch_size: train batch size
:type num_pass: int
:params num_pass: train pass number
:type num_pass: int
""" """
if train_data_dir is not None: if train_data_dir is not None:
assert word_dict_path, ("The parameter train_data_dir, word_dict_path " assert word_dict_path and label_dict_path, (
"The parameter train_data_dir, word_dict_path, label_dict_path "
"should be set at the same time.") "should be set at the same time.")
if not os.path.exists(model_save_dir): if not os.path.exists(model_save_dir):
...@@ -84,7 +81,6 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num, ...@@ -84,7 +81,6 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num,
logger.info("Please wait to build the word dictionary ...") logger.info("Please wait to build the word dictionary ...")
word_dict = reader.imdb_word_dict() word_dict = reader.imdb_word_dict()
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
lambda: reader.imdb_train(word_dict), buf_size=1000), lambda: reader.imdb_train(word_dict), buf_size=1000),
...@@ -99,69 +95,79 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num, ...@@ -99,69 +95,79 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num,
# build the word dictionary to map the original string-typed # build the word dictionary to map the original string-typed
# words into integer-typed index # words into integer-typed index
build_dict( build_word_dict(
data_dir=train_data_dir, data_dir=train_data_dir,
save_path=word_dict_path, save_path=word_dict_path,
use_col=1, use_col=1,
cutoff_fre=0) cutoff_fre=0)
if not os.path.exists(label_dict_path):
logger.info(("Label dictionary is not given, the dictionary "
"is automatically built from the training data."))
# build the label dictionary to map the original string-typed
# label into integer-typed index
build_label_dict(
data_dir=train_data_dir, save_path=label_dict_path, use_col=0)
word_dict = load_dict(word_dict_path) word_dict = load_dict(word_dict_path)
class_num = class_num label_dict = load_dict(label_dict_path)
class_num = len(label_dict)
logger.info("Class number is : %d." % class_num) logger.info("Class number is : %d." % class_num)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
reader.train_reader(train_data_dir, word_dict), buf_size=1000), reader.train_reader(train_data_dir, word_dict, label_dict),
batch_size=batch_size) buf_size=conf.buf_size),
batch_size=conf.batch_size)
if test_data_dir is not None: if test_data_dir is not None:
# here, because training and testing data share a same format, # here, because training and testing data share a same format,
# we still use the reader.train_reader to read the testing data. # we still use the reader.train_reader to read the testing data.
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
reader.train_reader(test_data_dir, word_dict), reader.train_reader(test_data_dir, word_dict, label_dict),
buf_size=1000), buf_size=conf.buf_size),
batch_size=batch_size) batch_size=conf.batch_size)
else: else:
test_reader = None test_reader = None
dict_dim = len(word_dict) dict_dim = len(word_dict)
emb_size = 28
hidden_size = 128
logger.info("Length of word dictionary is : %d." % (dict_dim)) logger.info("Length of word dictionary is : %d." % (dict_dim))
paddle.init(use_gpu=True, trainer_count=4) paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
# network config
cost, prob, label = nest_net(
dict_dim, emb_size, hidden_size, class_num, is_infer=False)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer # create optimizer
adam_optimizer = paddle.optimizer.Adam( adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3, learning_rate=conf.learning_rate,
regularization=paddle.optimizer.L2Regularization(rate=1e-3), regularization=paddle.optimizer.L2Regularization(
model_average=paddle.optimizer.ModelAverage(average_window=0.5)) rate=conf.l2_learning_rate),
model_average=paddle.optimizer.ModelAverage(
average_window=conf.average_window))
# create trainer # define network topology.
cost, prob, label = nested_net(dict_dim, class_num, is_infer=False)
# create all the trainable parameters.
parameters = paddle.parameters.create(cost)
# create the trainer instance.
trainer = paddle.trainer.SGD( trainer = paddle.trainer.SGD(
cost=cost, cost=cost,
extra_layers=paddle.evaluator.auc(input=prob, label=label), extra_layers=paddle.evaluator.auc(input=prob, label=label),
parameters=parameters, parameters=parameters,
update_equation=adam_optimizer) update_equation=adam_optimizer)
# begin training network # feeding dictionary
feeding = {"word": 0, "label": 1} feeding = {"word": 0, "label": 1}
def _event_handler(event): def _event_handler(event):
""" """
Define end batch and end pass event handler Define the end batch and the end pass event handler.
""" """
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0: if event.batch_id % conf.log_period == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s\n" % ( logger.info("Pass %d, Batch %d, Cost %f, %s\n" % (
event.pass_id, event.batch_id, event.cost, event.metrics)) event.pass_id, event.batch_id, event.cost, event.metrics))
...@@ -175,11 +181,12 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num, ...@@ -175,11 +181,12 @@ def train(train_data_dir, test_data_dir, word_dict_path, class_num,
event.pass_id), "w") as f: event.pass_id), "w") as f:
parameters.to_tar(f) parameters.to_tar(f)
# begin training network
trainer.train( trainer.train(
reader=train_reader, reader=train_reader,
event_handler=_event_handler, event_handler=_event_handler,
feeding=feeding, feeding=feeding,
num_passes=num_passes) num_passes=conf.num_passes)
logger.info("Training has finished.") logger.info("Training has finished.")
......
...@@ -6,7 +6,7 @@ logger = logging.getLogger("paddle") ...@@ -6,7 +6,7 @@ logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
def build_dict(data_dir, save_path, use_col=1, cutoff_fre=1): def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
values = defaultdict(int) values = defaultdict(int)
for file_name in os.listdir(data_dir): for file_name in os.listdir(data_dir):
...@@ -32,6 +32,31 @@ def build_dict(data_dir, save_path, use_col=1, cutoff_fre=1): ...@@ -32,6 +32,31 @@ def build_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
f.write("%s\t%d\n" % (v, count)) f.write("%s\t%d\n" % (v, count))
def build_label_dict(data_dir, save_path, use_col=0):
    """
    Build a label dictionary from the training data and write it to disk.

    Every regular file directly under ``data_dir`` is read line by line.
    Each line is stripped and split on tabs, and the field at index
    ``use_col`` is counted as one occurrence of a label. The result is
    written to ``save_path`` with one "label<TAB>count" entry per line,
    sorted by count in descending order.

    :param data_dir: directory holding the tab-separated data files.
    :type data_dir: str
    :param save_path: path of the dictionary file to be written.
    :type save_path: str
    :param use_col: zero-based index of the column that stores the label.
    :type use_col: int
    """
    values = defaultdict(int)

    for file_name in os.listdir(data_dir):
        file_path = os.path.join(data_dir, file_name)
        # sub-directories and other non-regular entries are ignored.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r") as fdata:
            for line in fdata:
                line_splits = line.strip().split("\t")
                # indexing column `use_col` needs at least use_col + 1
                # fields; shorter lines are skipped instead of raising
                # an IndexError.
                if len(line_splits) <= use_col:
                    continue
                values[line_splits[use_col]] += 1

    with open(save_path, "w") as f:
        # items() (rather than iteritems()) runs on both Python 2 and 3.
        for v, count in sorted(
                values.items(), key=lambda x: x[1], reverse=True):
            f.write("%s\t%d\n" % (v, count))
def load_dict(dict_path):
    """
    Load a dictionary file and map each entry to its line index.

    Each line is stripped and split on tabs; the first field is used as
    the key and the zero-based line number as the value.

    :param dict_path: path of the dictionary file.
    :type dict_path: str
    :return: mapping from the first field of each line to its line index.
    :rtype: dict
    """
    # the context manager closes the file once the mapping is built.
    with open(dict_path, "r") as fdict:
        return dict((line.strip().split("\t")[0], idx)
                    for idx, line in enumerate(fdict))
def load_reverse_dict(dict_path):
    """
    Load a dictionary file and map each line index to its entry.

    Each line is stripped and split on tabs; the zero-based line number is
    used as the key and the first field as the value.

    :param dict_path: path of the dictionary file.
    :type dict_path: str
    :return: mapping from line index to the first field of that line.
    :rtype: dict
    """
    # the context manager closes the file once the mapping is built.
    with open(dict_path, "r") as fdict:
        return dict((idx, line.strip().split("\t")[0])
                    for idx, line in enumerate(fdict))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册