提交 c59d32be 编写于 作者: L liaogang

Merge branch 'develop' of https://github.com/PaddlePaddle/book into readme

......@@ -48,7 +48,16 @@ FROM ${paddle_image}:${paddle_tag}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
COPY . /book
EOF
if [ -n ${http_proxy} ]; then
cat >> Dockerfile <<EOF
ENV http_proxy ${http_proxy}
ENV https_proxy ${http_proxy}
EOF
fi
cat >> Dockerfile <<EOF
RUN pip install -U nltk \
&& python /book/.tools/cache_dataset.py
......@@ -58,7 +67,7 @@ RUN ${update_mirror_cmd}
apt-get -y install gcc && \
apt-get -y clean && \
localedef -f UTF-8 -i en_US en_US.UTF-8 && \
pip install -U matplotlib jupyter numpy requests scipy
pip install -U pillow matplotlib jupyter numpy requests scipy
#convert md to ipynb
RUN /book/.tools/notedown.sh
......
......@@ -141,7 +141,8 @@ import paddle.v2 as paddle
def convolution_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=128):
hid_dim=128,
is_predict=False):
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
......@@ -152,9 +153,12 @@ def convolution_net(input_dim,
output = paddle.layer.fc(input=[conv_3, conv_4],
size=class_dim,
act=paddle.activation.Softmax())
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
if not is_predict:
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
else:
return output
```
网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
......@@ -165,7 +169,8 @@ def stacked_lstm_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=512,
stacked_num=3):
stacked_num=3,
is_predict=False):
"""
A Wrapper for sentiment classification task.
This network uses bi-directional recurrent network,
......@@ -223,9 +228,12 @@ def stacked_lstm_net(input_dim,
bias_attr=bias_attr,
param_attr=para_attr)
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
if not is_predict:
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
else:
return output
```
网络的输入`stacked_num`表示的是LSTM的层数,需要是奇数,确保最高层LSTM正向。Paddle里面是通过一个fc和一个lstmemory来实现基于LSTM的循环神经网络。
......@@ -294,7 +302,7 @@ Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。
### 训练
可以通过`paddle.trainer.SGD`构造一个sgd trainer,并调用`trainer.train`来训练模型。
可以通过`paddle.trainer.SGD`构造一个sgd trainer,并调用`trainer.train`来训练模型。另外,通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。
```python
# End batch and end pass event handler
def event_handler(event):
......@@ -309,7 +317,21 @@ Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```
可以通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。比如构造如下一个`event_handler`可以在每100个batch结束后输出cost和error;在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
比如,构造如下一个`event_handler`可以在每100个batch结束后输出cost和error;在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
```python
from paddle.v2.plot import Ploter
train_title = "Train cost"
cost_ploter = Ploter(train_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
cost_ploter.append(train_title, step, event.cost)
cost_ploter.plot()
step += 1
```
或者构造一个`event_handler_plot`画出cost曲线。
```python
# create trainer
trainer = paddle.trainer.SGD(cost=cost,
......@@ -331,6 +353,36 @@ Pass 0, Batch 100, Cost 0.294321, {'classification_error_evaluator': 0.1015625}
Test with Pass 0, {'classification_error_evaluator': 0.11432000249624252}
```
## 应用模型
可以使用训练好的模型对电影评论进行分类,下面程序展示了如何使用`paddle.infer`接口进行推断。
```python
import numpy as np
# Movie Reviews, from imdb test
reviews = [
'Read the book, forget the movie!',
'This is a great movie.'
]
reviews = [c.split() for c in reviews]
UNK = word_dict['<unk>']
input = []
for c in reviews:
input.append([[word_dict.get(words, UNK) for words in c]])
# 0 stands for positive sample, 1 stands for negative sample
label = {0:'pos', 1:'neg'}
# Use the network used by trainer
out = convolution_net(dict_dim, class_dim=class_dim, is_predict=True)
# out = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3, is_predict=True)
probs = paddle.infer(output_layer=out, parameters=parameters, input=input)
labs = np.argsort(-probs)
for idx, lab in enumerate(labs):
print idx, "predicting probability is", probs[idx], "label is", label[lab[0]]
```
## 总结
本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。
......
......@@ -183,7 +183,8 @@ import paddle.v2 as paddle
def convolution_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=128):
hid_dim=128,
is_predict=False):
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
......@@ -194,9 +195,12 @@ def convolution_net(input_dim,
output = paddle.layer.fc(input=[conv_3, conv_4],
size=class_dim,
act=paddle.activation.Softmax())
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
if not is_predict:
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
else:
return output
```
网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
......@@ -207,7 +211,8 @@ def stacked_lstm_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=512,
stacked_num=3):
stacked_num=3,
is_predict=False):
"""
A Wrapper for sentiment classification task.
This network uses bi-directional recurrent network,
......@@ -265,9 +270,12 @@ def stacked_lstm_net(input_dim,
bias_attr=bias_attr,
param_attr=para_attr)
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
if not is_predict:
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
else:
return output
```
网络的输入`stacked_num`表示的是LSTM的层数,需要是奇数,确保最高层LSTM正向。Paddle里面是通过一个fc和一个lstmemory来实现基于LSTM的循环神经网络。
......@@ -336,7 +344,7 @@ Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。
### 训练
可以通过`paddle.trainer.SGD`构造一个sgd trainer,并调用`trainer.train`来训练模型。
可以通过`paddle.trainer.SGD`构造一个sgd trainer,并调用`trainer.train`来训练模型。另外,通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。
```python
# End batch and end pass event handler
def event_handler(event):
......@@ -351,7 +359,21 @@ Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```
可以通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。比如构造如下一个`event_handler`可以在每100个batch结束后输出cost和error;在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
比如,构造如下一个`event_handler`可以在每100个batch结束后输出cost和error;在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。
```python
from paddle.v2.plot import Ploter
train_title = "Train cost"
cost_ploter = Ploter(train_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
cost_ploter.append(train_title, step, event.cost)
cost_ploter.plot()
step += 1
```
或者构造一个`event_handler_plot`画出cost曲线。
```python
# create trainer
trainer = paddle.trainer.SGD(cost=cost,
......@@ -373,6 +395,36 @@ Pass 0, Batch 100, Cost 0.294321, {'classification_error_evaluator': 0.1015625}
Test with Pass 0, {'classification_error_evaluator': 0.11432000249624252}
```
## 应用模型
可以使用训练好的模型对电影评论进行分类,下面程序展示了如何使用`paddle.infer`接口进行推断。
```python
import numpy as np
# Movie Reviews, from imdb test
reviews = [
'Read the book, forget the movie!',
'This is a great movie.'
]
reviews = [c.split() for c in reviews]
UNK = word_dict['<unk>']
input = []
for c in reviews:
input.append([[word_dict.get(words, UNK) for words in c]])
# 0 stands for positive sample, 1 stands for negative sample
label = {0:'pos', 1:'neg'}
# Use the network used by trainer
out = convolution_net(dict_dim, class_dim=class_dim, is_predict=True)
# out = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3, is_predict=True)
probs = paddle.infer(output_layer=out, parameters=parameters, input=input)
labs = np.argsort(-probs)
for idx, lab in enumerate(labs):
print idx, "predicting probability is", probs[idx], "label is", label[lab[0]]
```
## 总结
本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。
......
......@@ -214,6 +214,7 @@ import numpy as np
import gzip
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.evaluator as evaluator
paddle.init(use_gpu=False, trainer_count=1)
......@@ -343,23 +344,23 @@ for i in range(1, depth):
input_tmp = [mix_hidden, lstm]
```
- We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then, we put a fully connected layer on top to get the final feature vector representation.
- In PaddlePaddle, state features and transition features of a CRF are implemented by a fully connected layer and a CRF layer seperately. The fully connected layer with linear activation learns the state features, here we use paddle.layer.mixed (paddle.layer.fc can be uesed as well), and the CRF layer in PaddlePaddle: paddle.layer.crf only learns the transition features, which is a cost layer and is the last layer of the network. paddle.layer.crf outputs the log probability of true tag sequence as the cost by given the input sequence and it requires the true tag sequence as target in the learning process.
```python
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
```
```python
- At the end of the network, we use CRF as the cost function; the parameter of CRF cost will be named `crfw`.
# The output of the top LSTM unit and its input are feed into a fully connected layer,
# size of which equals to size of tag labels.
# The fully connected layer learns the state features
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)], )
```python
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=feature_out,
......@@ -370,7 +371,7 @@ crf_cost = paddle.layer.crf(
learning_rate=mix_hidden_lr))
```
- The CRF decoding layer is used for evaluation and inference. It shares weights with CRF layer. The sharing of parameters among multiple layers is specified by using the same parameter name in these layers.
- The CRF decoding layer is used for evaluation and inference. It shares weights with CRF layer. The sharing of parameters among multiple layers is specified by using the same parameter name in these layers. If true tag sequence is provided in training process, `paddle.layer.crf_decoding` calculates labelling error for each input token and `evaluator.sum` sum the error over the entire sequence. Otherwise, `paddle.layer.crf_decoding` generates the labelling tags.
```python
crf_dec = paddle.layer.crf_decoding(
......@@ -414,7 +415,7 @@ We will create trainer given model topology, parameters, and optimization method
```python
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-2,
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
......@@ -432,7 +433,7 @@ As mentioned in data preparation section, we will use CoNLL 2005 test corpus as
```python
reader = paddle.batch(
paddle.reader.shuffle(
conll05.test(), buf_size=8192), batch_size=20)
conll05.test(), buf_size=8192), batch_size=2)
```
`feeding` is used to specify the correspondence between data instance and data layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` is matched to the data layer named `word_data`.
......@@ -456,17 +457,17 @@ feeding = {
```python
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
if event.batch_id and event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
if event.batch_id % 400 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
......
......@@ -192,6 +192,7 @@ import numpy as np
import gzip
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.evaluator as evaluator
paddle.init(use_gpu=False, trainer_count=1)
......@@ -274,12 +275,12 @@ emb_layers.append(mark_embedding)
```python
hidden_0 = paddle.layer.mixed(
size=hidden_dim,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=emb, param_attr=std_default) for emb in emb_layers
])
size=hidden_dim,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=emb, param_attr=std_default) for emb in emb_layers
])
mix_hidden_lr = 1e-3
lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
......@@ -320,23 +321,24 @@ for i in range(1, depth):
input_tmp = [mix_hidden, lstm]
```
- 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,经过一个全连接层映射到标记字典的维度,得到最终的特征向量表示
- 在PaddlePaddle中,CRF的状态特征和转移特征分别由一个全连接层和一个PaddlePaddle中的CRF层分别学习。在这个例子中,我们用线性激活的paddle.layer.mixed 来学习CRF的状态特征(也可以使用paddle.layer.fc),而 paddle.layer.crf只学习转移特征。paddle.layer.crf层是一个 cost 层,处于整个网络的末端,输出给定输入序列下,标记序列的log probability作为代价。训练阶段,该层需要输入正确的标记序列作为学习目标
```python
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
```
- 网络的末端定义CRF层计算损失(cost),指定参数名字为 `crfw`,该层需要输入正确的数据标签(target)。
# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,
# 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征
```python
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
# 学习 CRF 的转移特征
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=feature_out,
......@@ -347,7 +349,7 @@ crf_cost = paddle.layer.crf(
learning_rate=mix_hidden_lr))
```
- CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型
- CRF解码和CRF层参数名字相同,即:加载了`paddle.layer.crf`层学习到的参数。在训练阶段,为`paddle.layer.crf_decoding` 输入了正确的标记序列(target),这一层会输出是否正确标记,`evaluator.sum` 用来计算序列上的标记错误率,可以用来评估模型。解码阶段,没有输入正确的数据标签,该层通过寻找概率最高的标记序列,解码出标记结果
```python
crf_dec = paddle.layer.crf_decoding(
......@@ -394,7 +396,7 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-2,
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
......@@ -412,7 +414,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
```python
reader = paddle.batch(
paddle.reader.shuffle(
conll05.test(), buf_size=8192), batch_size=20)
conll05.test(), buf_size=8192), batch_size=2)
```
通过`feeding`来指定每一个数据和data_layer的对应关系。 例如 下面`feeding`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。
......@@ -437,17 +439,17 @@ feeding = {
```python
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
if event.batch_id and event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
if event.batch_id % 400 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
......@@ -459,7 +461,7 @@ def event_handler(event):
trainer.train(
reader=reader,
event_handler=event_handler,
num_passes=10000,
num_passes=1,
feeding=feeding)
```
......
......@@ -256,6 +256,7 @@ import numpy as np
import gzip
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.evaluator as evaluator
paddle.init(use_gpu=False, trainer_count=1)
......@@ -385,23 +386,23 @@ for i in range(1, depth):
input_tmp = [mix_hidden, lstm]
```
- We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then, we put a fully connected layer on top to get the final feature vector representation.
- In PaddlePaddle, state features and transition features of a CRF are implemented by a fully connected layer and a CRF layer seperately. The fully connected layer with linear activation learns the state features, here we use paddle.layer.mixed (paddle.layer.fc can be uesed as well), and the CRF layer in PaddlePaddle: paddle.layer.crf only learns the transition features, which is a cost layer and is the last layer of the network. paddle.layer.crf outputs the log probability of true tag sequence as the cost by given the input sequence and it requires the true tag sequence as target in the learning process.
```python
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
```
```python
- At the end of the network, we use CRF as the cost function; the parameter of CRF cost will be named `crfw`.
# The output of the top LSTM unit and its input are feed into a fully connected layer,
# size of which equals to size of tag labels.
# The fully connected layer learns the state features
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)], )
```python
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=feature_out,
......@@ -412,7 +413,7 @@ crf_cost = paddle.layer.crf(
learning_rate=mix_hidden_lr))
```
- The CRF decoding layer is used for evaluation and inference. It shares weights with CRF layer. The sharing of parameters among multiple layers is specified by using the same parameter name in these layers.
- The CRF decoding layer is used for evaluation and inference. It shares weights with CRF layer. The sharing of parameters among multiple layers is specified by using the same parameter name in these layers. If true tag sequence is provided in training process, `paddle.layer.crf_decoding` calculates labelling error for each input token and `evaluator.sum` sum the error over the entire sequence. Otherwise, `paddle.layer.crf_decoding` generates the labelling tags.
```python
crf_dec = paddle.layer.crf_decoding(
......@@ -456,7 +457,7 @@ We will create trainer given model topology, parameters, and optimization method
```python
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-2,
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
......@@ -474,7 +475,7 @@ As mentioned in data preparation section, we will use CoNLL 2005 test corpus as
```python
reader = paddle.batch(
paddle.reader.shuffle(
conll05.test(), buf_size=8192), batch_size=20)
conll05.test(), buf_size=8192), batch_size=2)
```
`feeding` is used to specify the correspondence between data instance and data layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` is matched to the data layer named `word_data`.
......@@ -498,17 +499,17 @@ feeding = {
```python
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
if event.batch_id and event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
if event.batch_id % 400 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
......
......@@ -234,6 +234,7 @@ import numpy as np
import gzip
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.evaluator as evaluator
paddle.init(use_gpu=False, trainer_count=1)
......@@ -316,12 +317,12 @@ emb_layers.append(mark_embedding)
```python
hidden_0 = paddle.layer.mixed(
size=hidden_dim,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=emb, param_attr=std_default) for emb in emb_layers
])
size=hidden_dim,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=emb, param_attr=std_default) for emb in emb_layers
])
mix_hidden_lr = 1e-3
lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
......@@ -362,23 +363,24 @@ for i in range(1, depth):
input_tmp = [mix_hidden, lstm]
```
- 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,经过一个全连接层映射到标记字典的维度,得到最终的特征向量表示
- 在PaddlePaddle中,CRF的状态特征和转移特征分别由一个全连接层和一个PaddlePaddle中的CRF层分别学习。在这个例子中,我们用线性激活的paddle.layer.mixed 来学习CRF的状态特征(也可以使用paddle.layer.fc),而 paddle.layer.crf只学习转移特征。paddle.layer.crf层是一个 cost 层,处于整个网络的末端,输出给定输入序列下,标记序列的log probability作为代价。训练阶段,该层需要输入正确的标记序列作为学习目标
```python
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
```
- 网络的末端定义CRF层计算损失(cost),指定参数名字为 `crfw`,该层需要输入正确的数据标签(target)。
# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,
# 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征
```python
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
# 学习 CRF 的转移特征
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=feature_out,
......@@ -389,7 +391,7 @@ crf_cost = paddle.layer.crf(
learning_rate=mix_hidden_lr))
```
- CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型
- CRF解码和CRF层参数名字相同,即:加载了`paddle.layer.crf`层学习到的参数。在训练阶段,为`paddle.layer.crf_decoding` 输入了正确的标记序列(target),这一层会输出是否正确标记,`evaluator.sum` 用来计算序列上的标记错误率,可以用来评估模型。解码阶段,没有输入正确的数据标签,该层通过寻找概率最高的标记序列,解码出标记结果
```python
crf_dec = paddle.layer.crf_decoding(
......@@ -436,7 +438,7 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-2,
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
......@@ -454,7 +456,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
```python
reader = paddle.batch(
paddle.reader.shuffle(
conll05.test(), buf_size=8192), batch_size=20)
conll05.test(), buf_size=8192), batch_size=2)
```
通过`feeding`来指定每一个数据和data_layer的对应关系。 例如 下面`feeding`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。
......@@ -479,17 +481,17 @@ feeding = {
```python
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
if event.batch_id and event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
if event.batch_id % 400 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
......@@ -501,7 +503,7 @@ def event_handler(event):
trainer.train(
reader=reader,
event_handler=event_handler,
num_passes=10000,
num_passes=1,
feeding=feeding)
```
......
此差异已折叠。
此差异已折叠。
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
mkdir wmt14
cd wmt14
# download the dataset
wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz
wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz
# untar the dataset
tar -zxvf bitexts.tgz
tar -zxvf dev+test.tgz
gunzip bitexts.selected/*
mv bitexts.selected train
rm bitexts.tgz
rm dev+test.tgz
# separate the dev and test dataset
mkdir test gen
mv dev/ntst1213.* test
mv dev/ntst14.* gen
rm -rf dev
set +x
# rename the suffix, .fr->.src, .en->.trg
for dir in train test gen
do
filelist=`ls $dir`
cd $dir
for file in $filelist
do
if [ ${file##*.} = "fr" ]; then
mv $file ${file/%fr/src}
elif [ ${file##*.} = 'en' ]; then
mv $file ${file/%en/trg}
fi
done
cd ..
done
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
gen_file=$1
beam_size=$2
# find top1 generating result
top1=$(printf '%s_top1.txt' `basename $gen_file .txt`)
if [ $beam_size -eq 1 ]; then
awk -F "\t" '{sub(" <e>","",$2);sub(" ","",$2);print $2}' $gen_file >$top1
else
awk 'BEGIN{
FS="\t";
OFS="\t";
read_pos = 2} {
if (NR == read_pos){
sub(" <e>","",$3);
sub(" ","",$3);
print $3;
read_pos += (2 + res_num);
}}' res_num=$beam_size $gen_file >$top1
fi
# evalute bleu value
bleu_script=multi-bleu.perl
standard_res=data/wmt14/gen/ntst14.trg
bleu_res=`perl $bleu_script $standard_res <$top1`
echo $bleu_res | cut -d, -f 1
rm $top1
此差异已折叠。
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
echo "Downloading multi-bleu.perl"
wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
# download the pretrained model
wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
# untar the model
tar -zxvf wmt14_model.tar.gz
rm wmt14_model.tar.gz
import sys
import paddle.v2 as paddle
def seqToseq_net(source_dict_dim, target_dict_dim):
def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
beam_size = 3
max_length = 250
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
......@@ -67,78 +71,142 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
if not is_generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
# In generation, the decoder predicts a next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
# StaticInput, which is a read-only memory.
# Embedding of the last generated word is automatically gotten by
# GeneratedInputs, which is initialized by a start mark, such as <s>,
# and must be included in generation.
trg_embedding = paddle.layer.GeneratedInputV2(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
def main():
paddle.init(use_gpu=False, trainer_count=1)
is_generating = False
# source and target dict dim.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# define network topology
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
feeding = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2
}
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
batch_size=5)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
# train the network
if not is_generating:
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=8192),
batch_size=5)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost,
event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
# start to train
trainer.train(
reader=wmt14_reader, event_handler=event_handler, num_passes=2)
# generate a english sequence to french
else:
# use the first 3 samples for generation
gen_creator = paddle.dataset.wmt14.gen(dict_size)
gen_data = []
gen_num = 3
for item in gen_creator():
gen_data.append((item[0], ))
if len(gen_data) == gen_num:
break
beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
# get the pretrained model, whose bleu = 26.92
parameters = paddle.dataset.wmt14.model()
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
# get the dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
# the delimited element of generated sequences is -1,
# the first element of each generated sequence is the sequence length
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
sys.stdout.write('.')
sys.stdout.flush()
# start to train
trainer.train(
reader=wmt14_reader,
event_handler=event_handler,
num_passes=2,
feeding=feeding)
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
beam_size = 3
for i in xrange(gen_num):
print "\n*******************************************************\n"
print "src:", ' '.join(
[src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册