Commit 5e56527e authored by JiabinYang

Merge branch 'develop' of https://github.com/PaddlePaddle/book into book04_refine

@@ -215,6 +215,7 @@ print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id]
```python
+from __future__ import print_function
import math
import sys
import numpy as np
@@ -508,6 +509,11 @@ results = inferencer.infer(
        'movie_title': movie_title
    },
    return_numpy=False)
+predict_rating = np.array(results[0])
+print("Predict Rating of user id 1 on movie \"" + infer_movie_name + "\" is " + str(predict_rating[0][0]))
+print("Actual Rating of user id 1 on movie \"" + infer_movie_name + "\" is 4.")
```
## Summary
...
@@ -185,6 +185,7 @@ After issuing a command `python train.py`, training will start immediately. The
Our program starts with importing necessary packages and initializing some global variables:
```python
+from __future__ import print_function
import math
import sys
import numpy as np
...
@@ -257,6 +257,7 @@ print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id]
```python
+from __future__ import print_function
import math
import sys
import numpy as np
@@ -550,6 +551,11 @@ results = inferencer.infer(
        'movie_title': movie_title
    },
    return_numpy=False)
+predict_rating = np.array(results[0])
+print("Predict Rating of user id 1 on movie \"" + infer_movie_name + "\" is " + str(predict_rating[0][0]))
+print("Actual Rating of user id 1 on movie \"" + infer_movie_name + "\" is 4.")
```
## Summary
...
@@ -227,6 +227,7 @@ After issuing a command `python train.py`, training will start immediately. The
Our program starts with importing necessary packages and initializing some global variables:
```python
+from __future__ import print_function
import math
import sys
import numpy as np
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from __future__ import print_function
import math
import sys
import numpy as np
@@ -233,7 +234,11 @@ def infer(use_cuda, inference_program, params_dirname):
        },
        return_numpy=False)
-    print("infer results: ", np.array(results[0]))
+    predict_rating = np.array(results[0])
+    print("Predict Rating of user id 1 on movie \"" + infer_movie_name +
+          "\" is " + str(predict_rating[0][0]))
+    print("Actual Rating of user id 1 on movie \"" + infer_movie_name +
+          "\" is 4.")
def main(use_cuda):
...
@@ -107,6 +107,7 @@ Paddle implements automatic downloading and reading of the IMDB dataset in `dataset/imdb.py`
In this example we implement two text classification algorithms, based respectively on the text convolutional neural network introduced in the [Recommender System](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system) chapter and on the [Stacked Bidirectional LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM)). We first import the required libraries and define the global variables:
```python
+from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
@@ -115,6 +116,7 @@ import numpy as np
CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
+STACKED_NUM = 3
BATCH_SIZE = 128
USE_GPU = False
```
@@ -168,17 +170,12 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
        inputs = [fc, lstm]
-    fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=paddle.pooling.Max())
-    lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=paddle.pooling.Max())
-    output = paddle.layer.fc(input=[fc_last, lstm_last],
-                             size=class_dim,
-                             act=paddle.activation.Softmax(),
-                             bias_attr=bias_attr,
-                             param_attr=para_attr)
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost, output
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+    prediction = fluid.layers.fc(
+        input=[fc_last, lstm_last], size=class_dim, act='softmax')
+    return prediction
```
The stacked bidirectional LSTM above extracts high-level features and maps them to a vector whose size equals the number of classes; the `softmax` activation computes the probability of each class.
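The two `sequence_pool` calls in this hunk reduce each variable-length sequence of hidden vectors to one fixed-size vector by taking an element-wise max over time steps. A minimal numpy sketch of that reduction (toy values, not the PaddlePaddle API):

```python
import numpy as np

# toy sequence: 4 time steps, each a 3-dimensional feature vector
seq = np.array([[0.1, 0.9, 0.3],
                [0.7, 0.2, 0.5],
                [0.4, 0.8, 0.6],
                [0.0, 0.1, 0.2]])
pooled = seq.max(axis=0)  # element-wise max over the time dimension
print(pooled)             # [0.7 0.9 0.6] -- one fixed-size vector per sequence
```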
@@ -193,6 +190,7 @@ def inference_program(word_dict):
    dict_dim = len(word_dict)
    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
    return net
```
@@ -301,7 +299,7 @@ trainer.train(
```python
inferencer = fluid.Inferencer(
-    inference_program, param_path=params_dirname, place=place)
+    infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place)
```
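The fix in this hunk binds `word_dict` into the program builder with `functools.partial`, since `infer_func` must be callable without extra arguments. A minimal sketch of the same pattern, with a made-up `build_program` and vocabulary:

```python
from functools import partial

def build_program(word_dict):
    # hypothetical builder that needs the vocabulary before it can run
    return len(word_dict)

infer_func = partial(build_program, {"the": 0, "movie": 1})
print(infer_func())  # now callable with no arguments -> 2
```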
### Generating Test Input Data
...
@@ -103,6 +103,7 @@ After issuing a command `python train.py`, training will start immediately. The
Our program starts with importing necessary packages and initializing some global variables:
```python
+from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
@@ -111,6 +112,7 @@ import numpy as np
CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
+STACKED_NUM = 3
BATCH_SIZE = 128
USE_GPU = False
```
@@ -192,6 +194,7 @@ def inference_program(word_dict):
    dict_dim = len(word_dict)
    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
    return net
```
...
@@ -149,6 +149,7 @@ Paddle implements automatic downloading and reading of the IMDB dataset in `dataset/imdb.py`
In this example we implement two text classification algorithms, based respectively on the text convolutional neural network introduced in the [Recommender System](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system) chapter and on the [Stacked Bidirectional LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM)). We first import the required libraries and define the global variables:
```python
+from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
@@ -157,6 +158,7 @@ import numpy as np
CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
+STACKED_NUM = 3
BATCH_SIZE = 128
USE_GPU = False
```
@@ -210,17 +212,12 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
        inputs = [fc, lstm]
-    fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=paddle.pooling.Max())
-    lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=paddle.pooling.Max())
-    output = paddle.layer.fc(input=[fc_last, lstm_last],
-                             size=class_dim,
-                             act=paddle.activation.Softmax(),
-                             bias_attr=bias_attr,
-                             param_attr=para_attr)
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost, output
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+    prediction = fluid.layers.fc(
+        input=[fc_last, lstm_last], size=class_dim, act='softmax')
+    return prediction
```
The stacked bidirectional LSTM above extracts high-level features and maps them to a vector whose size equals the number of classes; the `softmax` activation computes the probability of each class.
@@ -235,6 +232,7 @@ def inference_program(word_dict):
    dict_dim = len(word_dict)
    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
    return net
```
@@ -343,7 +341,7 @@ trainer.train(
```python
inferencer = fluid.Inferencer(
-    inference_program, param_path=params_dirname, place=place)
+    infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place)
```
### Generating Test Input Data
...
@@ -145,6 +145,7 @@ After issuing a command `python train.py`, training will start immediately. The
Our program starts with importing necessary packages and initializing some global variables:
```python
+from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
@@ -153,6 +154,7 @@ import numpy as np
CLASS_DIM = 2
EMB_DIM = 128
HID_DIM = 512
+STACKED_NUM = 3
BATCH_SIZE = 128
USE_GPU = False
```
@@ -234,6 +236,7 @@ def inference_program(word_dict):
    dict_dim = len(word_dict)
    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
    return net
```
...
@@ -25,7 +25,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
Figure 1. Example of a dependency parse tree
</div>
However, full syntactic parsing must determine all of the syntactic information a sentence contains and the relations among its constituents. It is a very difficult task, the accuracy of current parsers is limited, and even small parsing mistakes propagate into SRL errors. To reduce the complexity of the problem while still obtaining some syntactic structure, the idea of shallow syntactic parsing arose. Shallow parsing, also known as partial parsing or chunking, does not produce a complete parse tree the way full parsing does; it only needs to identify certain relatively simple, independent constituents of a sentence, for example verb phrases, and the structures so identified are called chunks. To sidestep the difficulty that no sufficiently accurate parse tree can be obtained, some studies \[[1](#参考文献)\] proposed chunk-based SRL methods, which treat SRL as a sequence labeling problem. Sequence labeling tasks usually define their label set with the BIO scheme, which we introduce first. In BIO notation, B marks the beginning of a chunk, I marks the inside of a chunk, and O marks a chunk that belongs to no argument.
-For example, for an argument with role A, the first chunk it contains is labeled B-A, the other chunks it contains are labeled I-A, and chunks belonging to no argument are labeled O.
+For example, for the group of chunks spanned by an argument of role A, the first chunk in the group is labeled B-A, the other chunks in the group are labeled I-A, and chunks belonging to no argument are labeled O.
We continue with this sentence as the example; Figure 1 illustrates the BIO representation.
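To make the BIO scheme concrete, here is a small sketch (chunk count, spans, and roles invented for illustration) that expands role spans over chunks into B-/I-/O tags:

```python
def bio_tags(num_chunks, spans):
    """spans: (start, end, role) chunk index ranges, end exclusive."""
    tags = ["O"] * num_chunks          # chunks in no argument stay O
    for start, end, role in spans:
        tags[start] = "B-" + role      # first chunk of the argument
        for i in range(start + 1, end):
            tags[i] = "I-" + role      # remaining chunks of the argument
    return tags

# toy example: 5 chunks, an Agent argument and a two-chunk Time argument
print(bio_tags(5, [(0, 1, "Agent"), (1, 3, "Time")]))
# ['B-Agent', 'B-Time', 'I-Time', 'O', 'O']
```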
@@ -151,14 +151,6 @@ conll05st-release/
4. Construct the labels in BIO format;
5. Look up the integer index of each word in the dictionary.
-```python
-# import paddle.v2.dataset.conll05 as conll05
-# conll05.corpus_reader performs steps 1 and 2 above.
-# conll05.reader_creator performs steps 3 through 5.
-# conll05.test yields each preprocessed sample for PaddlePaddle training.
-```
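As a toy illustration of step 5 (vocabulary invented here; the real dictionaries come from the conll05 loaders), the lookup falls back to an unknown-word index:

```python
# map words to integer indices, falling back to <unk> for unseen words
word_dict = {"<unk>": 0, "set": 1, "a": 2, "record": 3}
sentence = ["set", "a", "new", "record"]
ids = [word_dict.get(w, word_dict["<unk>"]) for w in sentence]
print(ids)  # [1, 2, 0, 3] -- "new" is out of vocabulary
```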
After preprocessing, each training sample contains nine features: the word sequence, the predicate, the predicate context (5 columns), the predicate context region mark, and the label sequence. The table below shows one training sample.
| word sequence | predicate | predicate context (window = 5) | predicate context region mark | label sequence |
@@ -187,6 +179,8 @@ conll05st-release/
Fetch the dictionaries and print their sizes:
```python
+from __future__ import print_function
import math, os
import numpy as np
import paddle
@@ -201,9 +195,9 @@ word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
-print word_dict_len
-print label_dict_len
-print pred_dict_len
+print('word_dict_len: ', word_dict_len)
+print('label_dict_len: ', label_dict_len)
+print('pred_dict_len: ', pred_dict_len)
```
## Model Configuration
@@ -431,7 +425,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
        cost = cost[0]
        if batch_id % 10 == 0:
-            print("avg_cost:" + str(cost))
+            print("avg_cost: " + str(cost))
            if batch_id != 0:
                print("second per batch: " + str((time.time(
                ) - start_time) / batch_id))
...
@@ -175,13 +175,6 @@ The raw data needs to be preprocessed into formats that PaddlePaddle can handle.
4. Construct the markings in BIO format;
5. Obtain the integer index corresponding to the word according to the dictionary.
-```python
-# import paddle.v2.dataset.conll05 as conll05
-# conll05.corpus_reader does step 1 and 2 as mentioned above.
-# conll05.reader_creator does step 3 to 5.
-# conll05.test gets preprocessed training instances.
-```
After preprocessing, a training sample contains nine features, namely: word sequence, predicate, predicate context (5 columns), region mark sequence, label sequence. The following table is an example of a training sample.
| word sequence | predicate | predicate context(5 columns) | region mark sequence | label sequence |
@@ -209,6 +202,8 @@ We trained a language model on the English Wikipedia to get a word vector lookup
Here we fetch the dictionary, and print its size:
```python
+from __future__ import print_function
import math, os
import numpy as np
import paddle
@@ -223,9 +218,9 @@ word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
-print word_dict_len
-print label_dict_len
-print pred_dict_len
+print('word_dict_len: ', word_dict_len)
+print('label_dict_len: ', label_dict_len)
+print('pred_dict_len: ', pred_dict_len)
```
## Model Configuration
@@ -440,7 +435,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
        cost = cost[0]
        if batch_id % 10 == 0:
-            print("avg_cost:" + str(cost))
+            print("avg_cost: " + str(cost))
            if batch_id != 0:
                print("second per batch: " + str((time.time(
                ) - start_time) / batch_id))
...
@@ -67,7 +67,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
Figure 1. Example of a dependency parse tree
</div>
However, full syntactic parsing must determine all of the syntactic information a sentence contains and the relations among its constituents. It is a very difficult task, the accuracy of current parsers is limited, and even small parsing mistakes propagate into SRL errors. To reduce the complexity of the problem while still obtaining some syntactic structure, the idea of shallow syntactic parsing arose. Shallow parsing, also known as partial parsing or chunking, does not produce a complete parse tree the way full parsing does; it only needs to identify certain relatively simple, independent constituents of a sentence, for example verb phrases, and the structures so identified are called chunks. To sidestep the difficulty that no sufficiently accurate parse tree can be obtained, some studies \[[1](#参考文献)\] proposed chunk-based SRL methods, which treat SRL as a sequence labeling problem. Sequence labeling tasks usually define their label set with the BIO scheme, which we introduce first. In BIO notation, B marks the beginning of a chunk, I marks the inside of a chunk, and O marks a chunk that belongs to no argument.
-For example, for an argument with role A, the first chunk it contains is labeled B-A, the other chunks it contains are labeled I-A, and chunks belonging to no argument are labeled O.
+For example, for the group of chunks spanned by an argument of role A, the first chunk in the group is labeled B-A, the other chunks in the group are labeled I-A, and chunks belonging to no argument are labeled O.
We continue with this sentence as the example; Figure 1 illustrates the BIO representation.
@@ -193,14 +193,6 @@ conll05st-release/
4. Construct the labels in BIO format;
5. Look up the integer index of each word in the dictionary.
-```python
-# import paddle.v2.dataset.conll05 as conll05
-# conll05.corpus_reader performs steps 1 and 2 above.
-# conll05.reader_creator performs steps 3 through 5.
-# conll05.test yields each preprocessed sample for PaddlePaddle training.
-```
After preprocessing, each training sample contains nine features: the word sequence, the predicate, the predicate context (5 columns), the predicate context region mark, and the label sequence. The table below shows one training sample.
| word sequence | predicate | predicate context (window = 5) | predicate context region mark | label sequence |
@@ -229,6 +221,8 @@ conll05st-release/
Fetch the dictionaries and print their sizes:
```python
+from __future__ import print_function
import math, os
import numpy as np
import paddle
@@ -243,9 +237,9 @@ word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
-print word_dict_len
-print label_dict_len
-print pred_dict_len
+print('word_dict_len: ', word_dict_len)
+print('label_dict_len: ', label_dict_len)
+print('pred_dict_len: ', pred_dict_len)
```
## Model Configuration
@@ -473,7 +467,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
        cost = cost[0]
        if batch_id % 10 == 0:
-            print("avg_cost:" + str(cost))
+            print("avg_cost: " + str(cost))
            if batch_id != 0:
                print("second per batch: " + str((time.time(
                ) - start_time) / batch_id))
...
@@ -217,13 +217,6 @@ The raw data needs to be preprocessed into formats that PaddlePaddle can handle.
4. Construct the markings in BIO format;
5. Obtain the integer index corresponding to the word according to the dictionary.
-```python
-# import paddle.v2.dataset.conll05 as conll05
-# conll05.corpus_reader does step 1 and 2 as mentioned above.
-# conll05.reader_creator does step 3 to 5.
-# conll05.test gets preprocessed training instances.
-```
After preprocessing, a training sample contains nine features, namely: word sequence, predicate, predicate context (5 columns), region mark sequence, label sequence. The following table is an example of a training sample.
| word sequence | predicate | predicate context(5 columns) | region mark sequence | label sequence |
@@ -251,6 +244,8 @@ We trained a language model on the English Wikipedia to get a word vector lookup
Here we fetch the dictionary, and print its size:
```python
+from __future__ import print_function
import math, os
import numpy as np
import paddle
@@ -265,9 +260,9 @@ word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
-print word_dict_len
-print label_dict_len
-print pred_dict_len
+print('word_dict_len: ', word_dict_len)
+print('label_dict_len: ', label_dict_len)
+print('pred_dict_len: ', pred_dict_len)
```
## Model Configuration
@@ -482,7 +477,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
        cost = cost[0]
        if batch_id % 10 == 0:
-            print("avg_cost:" + str(cost))
+            print("avg_cost: " + str(cost))
            if batch_id != 0:
                print("second per batch: " + str((time.time(
                ) - start_time) / batch_id))
...
+from __future__ import print_function
import math, os
import numpy as np
import paddle
...
@@ -85,7 +85,7 @@
2. Normalize $z_{i+1}$ with `softmax` to obtain the probability distribution $p_{i+1}$ of the $(i+1)$-th word in the target-language sequence. The distribution is given by:
-$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
+$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
where $W_sz_{i+1}+b_z$ scores every candidate output word, and softmax normalization then yields the probability $p_{i+1}$ of the $(i+1)$-th word.
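As a worked sketch of this formula (toy dimensions and random weights, not the trained model): the affine map $W_sz_{i+1}+b_z$ produces one score per vocabulary word, and softmax turns the scores into a distribution:

```python
import numpy as np

np.random.seed(0)
hidden_size, vocab_size = 4, 6
z = np.random.randn(hidden_size)          # decoder state z_{i+1}
W_s = np.random.randn(vocab_size, hidden_size)
b_z = np.zeros(vocab_size)

scores = W_s.dot(z) + b_z                 # one score per vocabulary word
p = np.exp(scores - scores.max())
p /= p.sum()                              # softmax -> probabilities
print(p.sum(), p.argmax())                # sums to 1 (up to rounding); most likely word id
```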
@@ -132,6 +132,7 @@
Next we configure the model according to the form of the input data. First, import the required libraries and define the global variables.
```python
+from __future__ import print_function
import contextlib
import numpy as np
@@ -437,10 +438,13 @@ for data in test_data():
    result_scores = np.array(results[1])
    print("Original sentence:")
-    print(" ".join([src_dict[w] for w in feed_data[0][0]]))
-    print("Translated sentence:")
-    print(" ".join([trg_dict[w] for w in result_ids]))
-    print("Corresponding score: ", result_scores)
+    print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]]))
+    print("Translated score and sentence:")
+    for i in xrange(beam_size):
+        start_pos = result_ids_lod[1][i] + 1
+        end_pos = result_ids_lod[1][i+1]
+        print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1],
+              " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]])))
    break
```
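The added loop decodes the LoD (level-of-detail) layout of the beam-search output: consecutive level-1 offsets delimit one candidate translation each, the `+ 1` skips the leading start token, and a candidate's score sits at its last position. A plain-Python sketch of that slicing, with made-up ids and offsets:

```python
# hypothetical flattened beam-search output, beam_size = 3
result_ids = [1, 5, 9, 2,   1, 5, 7, 2,   1, 6, 9, 2]
lod = [0, 4, 8, 12]    # candidate i occupies result_ids[lod[i]:lod[i+1]]
scores = [0.0] * 12
scores[3], scores[7], scores[11] = -2.1, -2.7, -3.4  # score at last token

for i in range(3):
    start, end = lod[i] + 1, lod[i + 1]  # skip the leading <s> token
    print(i + 1, scores[end - 1], result_ids[start:end])
```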
...
@@ -114,7 +114,7 @@ The goal of the decoder is to maximize the probability of the next correct word
2. Calculate the probability $p_{i+1}$ for the $i+1$-th word in the target language sequence by normalizing $z_{i+1}$ using `softmax` as follows
-$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
+$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
where $W_sz_{i+1}+b_z$ scores each possible word and is then normalized via softmax to produce the probability $p_{i+1}$ for the $i+1$-th word.
@@ -169,6 +169,7 @@ This subset has 193319 instances of training data and 6003 instances of test data
Our program starts with importing necessary packages and initializing some global variables:
```python
+from __future__ import print_function
import contextlib
import numpy as np
@@ -485,10 +486,13 @@ for data in test_data():
    result_scores = np.array(results[1])
    print("Original sentence:")
-    print(" ".join([src_dict[w] for w in feed_data[0][0]]))
-    print("Translated sentence:")
-    print(" ".join([trg_dict[w] for w in result_ids]))
-    print("Corresponding score: ", result_scores)
+    print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]]))
+    print("Translated score and sentence:")
+    for i in xrange(beam_size):
+        start_pos = result_ids_lod[1][i] + 1
+        end_pos = result_ids_lod[1][i+1]
+        print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1],
+              " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]])))
    break
```
...
@@ -127,7 +127,7 @@
2. Normalize $z_{i+1}$ with `softmax` to obtain the probability distribution $p_{i+1}$ of the $(i+1)$-th word in the target-language sequence. The distribution is given by:
-$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
+$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
where $W_sz_{i+1}+b_z$ scores every candidate output word, and softmax normalization then yields the probability $p_{i+1}$ of the $(i+1)$-th word.
@@ -174,6 +174,7 @@
Next we configure the model according to the form of the input data. First, import the required libraries and define the global variables.
```python
+from __future__ import print_function
import contextlib
import numpy as np
@@ -479,10 +480,13 @@ for data in test_data():
    result_scores = np.array(results[1])
    print("Original sentence:")
-    print(" ".join([src_dict[w] for w in feed_data[0][0]]))
-    print("Translated sentence:")
-    print(" ".join([trg_dict[w] for w in result_ids]))
-    print("Corresponding score: ", result_scores)
+    print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]]))
+    print("Translated score and sentence:")
+    for i in xrange(beam_size):
+        start_pos = result_ids_lod[1][i] + 1
+        end_pos = result_ids_lod[1][i+1]
+        print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1],
+              " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]])))
    break
```
...
@@ -156,7 +156,7 @@ The goal of the decoder is to maximize the probability of the next correct word
2. Calculate the probability $p_{i+1}$ for the $i+1$-th word in the target language sequence by normalizing $z_{i+1}$ using `softmax` as follows
-$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
+$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
where $W_sz_{i+1}+b_z$ scores each possible word and is then normalized via softmax to produce the probability $p_{i+1}$ for the $i+1$-th word.
@@ -211,6 +211,7 @@ This subset has 193319 instances of training data and 6003 instances of test data
Our program starts with importing necessary packages and initializing some global variables:
```python
+from __future__ import print_function
import contextlib
import numpy as np
@@ -527,10 +528,13 @@ for data in test_data():
    result_scores = np.array(results[1])
    print("Original sentence:")
-    print(" ".join([src_dict[w] for w in feed_data[0][0]]))
-    print("Translated sentence:")
-    print(" ".join([trg_dict[w] for w in result_ids]))
-    print("Corresponding score: ", result_scores)
+    print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]]))
+    print("Translated score and sentence:")
+    for i in xrange(beam_size):
+        start_pos = result_ids_lod[1][i] + 1
+        end_pos = result_ids_lod[1][i+1]
+        print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1],
+              " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]])))
    break
```
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
@@ -187,10 +188,14 @@ def decode_main(use_cuda):
        result_scores = np.array(results[1])
        print("Original sentence:")
-        print(" ".join([src_dict[w] for w in feed_data[0][0]]))
-        print("Translated sentence:")
-        print(" ".join([trg_dict[w] for w in result_ids]))
-        print("Corresponding score: ", result_scores)
+        print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]]))
+        print("Translated score and sentence:")
+        for i in xrange(beam_size):
+            start_pos = result_ids_lod[1][i] + 1
+            end_pos = result_ids_lod[1][i + 1]
+            print("%d\t%.4f\t%s\n" % (
+                i + 1, result_scores[end_pos - 1],
+                " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]])))
        break
...