diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d8234b69cb092a25eb884a754600168f9a67f75..9fc06f7e45574fac799709255a1341df601058ef 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,20 @@ -- repo: https://github.com/Lucas-C/pre-commit-hooks.git - sha: c25201a00e6b0514370501050cf2a8538ac12270 - hooks: - - id: remove-crlf - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ # Bazel BUILD files follow Python syntax. - repo: https://github.com/pre-commit/pre-commit-hooks - sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469 + sha: v0.7.1 hooks: - id: check-merge-conflict - id: check-symlinks - id: detect-private-key - id: end-of-file-fixer + - id: trailing-whitespace +- repo: git://github.com/Lucas-C/pre-commit-hooks + sha: v1.0.1 + hooks: + - id: forbid-crlf + - id: remove-crlf + - id: forbid-tabs + - id: remove-tabs diff --git a/.tmpl/convert-markdown-into-html.sh b/.tmpl/convert-markdown-into-html.sh index 149c686bc502b7fed97453e0769a7ef6ee841b76..cd3b01adb08d1a6298ba2003bd039375ea50eb1d 100755 --- a/.tmpl/convert-markdown-into-html.sh +++ b/.tmpl/convert-markdown-into-html.sh @@ -67,7 +67,7 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) EOF diff --git a/.tmpl/marked.js b/.tmpl/marked.js index 3c4fbe885422d11cdfdea4dfcdb71c3f42ef2022..0499d1d4e383ee3f866b9f9eed91ae775fe3da10 100644 --- a/.tmpl/marked.js +++ b/.tmpl/marked.js @@ -1093,7 +1093,7 @@ function escape(html, encode) { } function unescape(html) { - // explicitly match decimal, hex, and named HTML entities + // explicitly match decimal, hex, and named HTML entities return html.replace(/&(#(?:\d+)|(?:#x[0-9A-Fa-f]+)|(?:\w+));?/g, function(_, n) { n = n.toLowerCase(); if (n === 'colon') return ':'; diff --git a/fit_a_line/README.en.md 
b/fit_a_line/README.en.md index a804ca9192d4df295bce81d9b95f1c69e9478439..21f457c3730c19ffaa6375fb346981c06b9d008f 100644 --- a/fit_a_line/README.en.md +++ b/fit_a_line/README.en.md @@ -1,59 +1,73 @@ # Linear Regression -Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict house prices. Some important concepts in Machine Learning will be covered through this example. +Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict home prices. Some important concepts in Machine Learning will be covered through this example. -The source code for this tutorial is at [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line). If this is your first time using PaddlePaddle, please refer to the [Install Guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html). +The source code for this tutorial lives on [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line). For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html). -## Problem -Suppose we have a dataset of $n$ houses. Each house $i$ has $d$ properties and the price $y_i$. A property $x_{i,d}$ describes one aspect of the house, for example, the number of rooms in the house, the number of schools or hospitals in the neighborhood, the nearby traffic condition, etc. Our task is to predict $y_i$ given a set of properties $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price is a linear combination of all the properties, i.e., +## Problem Setup +Suppose we have a dataset of $n$ real estate properties. These real estate properties will be referred to as *homes* in this chapter for clarity. + +Each home is associated with $d$ attributes. 
The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby. + +In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely, $$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b, i=1,\ldots,n$$ -where $\omega_{d}$ and $b$ are the model parameters we want to estimate. Once they are learned, given a set of properties of a house, we will be able to predict a price for that house. The model we have here is called Linear Regression, namely, we want to regress a value as a linear combination of several values. In practice this linear model for our problem is hardly true, because the real relationship between the house properties and the price is much more complicated. However, due to its simple formulation which makes the model training and analysis easy, Linear Regression has been applied to lots of real problems. It is always an important topic in many classical Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\]. +where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. 
As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\]. ## Results Demonstration -We first show the training result of our model. We use the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) to train a linear model and predict the house prices in Boston. The figure below shows the predictions the model makes for some house prices. The $X$ coordinate of each point represents the median value of the prices of a certain type of houses, while the $Y$ coordinate represents the predicted value by our linear model. When $X=Y$, the point lies exactly on the dotted line. In other words, the more precise the model predicts, the closer the point is to the dotted line. +We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the more precisely the model predicts, the closer the point is to the dotted line.

-
- Figure 1. Predicted Value V.S. Actual Value +
+ Figure 1. Predicted Value V.S. Actual Value

## Model Overview ### Model Definition -In the UCI Housing Data Set, there are 13 house properties $x_{i,d}$ that are related to the median house price $y_i$. Thus our model is: +In the UCI Housing Data Set, there are 13 home attributes $\{x_{i,j}\}$ that are related to the median home price $y_i$, which we aim to predict. Thus, our model can be written as: $$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ -where $\hat{Y}$ is the predicted value used to differentiate from the actual value $Y$. The model parameters to be learned are: $\omega_1, \ldots, \omega_{13}, b$, where $\omega$ are called the weights and $b$ is called the bias. +where $\hat{Y}$ is the predicted value used to differentiate from the actual value $Y$. The model learns parameters $\omega_1, \ldots, \omega_{13}, b$, where the entries of $\vec{\omega}$ are the **weights** and $b$ is the **bias**. -Now we need an optimization goal, so that with the learned parameters, $\hat{Y}$ is close to $Y$ as much as possible. Here we introduce the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). The Loss Function has such property: given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$, its output is always non-negative. This non-negative value reflects the model error. +Now we need an objective to optimize, so that the learned parameters can make $\hat{Y}$ as close to $Y$ as possible. Let's refer to the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). A loss function must output a non-negative value, given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$. This value reflects the magnitude of the model error. 
-For Linear Regression, the most common Loss Function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form: +For Linear Regression, the most common loss function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form: $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ -For a dataset of size $n$, MSE is the average value of the $n$ predicted errors. +That is, for a dataset of size $n$, MSE is the average value of the squared prediction errors. ### Training -After defining our model, we have several major steps for the training: -1. Initialize the parameters including the weights $\omega$ and the bias $b$. For example, we can set their mean values as 0s, and their standard deviations as 1s. -2. Feedforward to compute the network output and the Loss Function. -3. Backward to [backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors. +After setting up our model, there are several major steps to go through to train it: +1. Initialize the parameters including the weights $\vec{\omega}$ and the bias $b$. For example, we can set their mean values as $0$s, and their standard deviations as $1$s. +2. Feedforward. Evaluate the network output and compute the corresponding loss. +3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors. 4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of repeats is reached. 
-## Data Preparation -Follow the command below to prepare data: -```bash -cd data && python prepare_data.py +## Dataset + +### Python Dataset Modules + +Our program starts with importing necessary packages: + +```python +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing ``` -This line of code will download the dataset from the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) and perform some [preprocessing](#Preprocessing). The dataset is split into a training set and a test set. -The dataset contains 506 lines in total, each line describing the properties and the median price of a certain type of houses in Boston. The meaning of each line is below: +We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`. This module can +1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if it has not been downloaded yet, and +2. [preprocess](#preprocessing) the dataset. -| Property Name | Explanation | Data Type | +### An Introduction of the Dataset + +The UCI housing dataset has 506 instances. Each instance describes the attributes of a house in suburban Boston. The attributes are explained below: + +| Attribute Name | Characteristic | Data Type | | ------| ------ | ------ | | CRIM | per capita crime rate by town | Continuous| | ZN | proportion of residential land zoned for lots over 25,000 sq.ft. | Continuous | @@ -70,113 +84,115 @@ The dataset contains 506 lines in total, each line describing the properties and | LSTAT | % lower status of the population | Continuous | | MEDV | Median value of owner-occupied homes in $1000's | Continuous | -The last entry is the median house price. +The last entry is the median home price. ### Preprocessing #### Continuous and Discrete Data -We define a feature vector of length 13 for each house, where each entry of the feature vector corresponds to a property of that house. 
Our first observation is that among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension. Note that although a discrete value is also written as digits such as 0, 1, or 2, it has a quite different meaning from a continuous value. The reason is that the difference between two discrete values has no practical meaning. For example, if we use 0, 1, and 2 to represent `red`, `green`, and `blue` respectively, although the numerical difference between `red` and `green` is smaller than that between `red` and `blue`, we cannot say that the extent to which `blue` is different from `red` is greater than the extent to which `green` is different from `red`. Therefore, when handling a discrete feature that has $d$ possible values, we will usually convert it to $d$ new features where each feature can only take 0 or 1, indicating whether the original $d$th value is present or not. Or we can map the discrete feature to a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing. +We define a feature vector of length 13 for each home, where each entry corresponds to an attribute. Our first observation is that, among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension. + +Note that although a discrete value is also written as numeric values such as 0, 1, or 2, its meaning differs from a continuous value drastically. The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. 
Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing. #### Feature Normalization -Another observation we have is that there is a huge difference among the value ranges of the 13 features (Figure 2). For example, feature B has a value range of [0.32, 396.90] while feature NOX has a range of [0.3850, 0.8170]. For an effective optimization, here we need data normalization. The goal of data normalization is to scale each feature into roughly the same value range, for example [-0.5, 0.5]. In this example, we adopt a standard way of normalization: substracting the mean value from the feature and divide the result by the original value range. +We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* fall in $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we subtract the mean value from the feature value and divide the result by the width of the original range. There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling): - A value range that is too large or too small might cause floating number overflow or underflow during computation. 
-- Different value ranges might result in different importances of different features to the model (at least in the beginning of the training process), which however is an unreasonable assumption. Such assumption makes the optimization more difficult and increases the training time a lot. -- Many Machine Learning techniques or models (e.g., L1/L2 regularization and Vector Space Model) are based on the assumption that all the features have roughly zero means and their value ranges are similar. +- Different value ranges might result in varying *importances* of different features to the model (at least in the beginning of the training process). This assumption about the data is often unreasonable, making the optimization difficult, which in turn results in increased training time. +- Many machine learning techniques or models (e.g., *L1/L2 regularization* and *Vector Space Model*) assume that all the features have roughly zero means and their value ranges are similar.

-
- Figure 2. The value ranges of the features +
+ Figure 2. The value ranges of the features

#### Prepare Training and Test Sets -We split the dataset into two subsets, one for estimating the model parameters, namely, model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal of training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$. You can try different split ratios to observe how the two variances change. +We split the dataset in two, one for adjusting the model parameters, namely, for model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$. + + +When training complex models, we usually have one more split: the validation set. 
Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models and cross-validate them on the validation set to pick the best one; finally, the selected trained model is tested on the test set. Because our model is relatively simple, we will omit this validation process. + + +## Training + +`fit_a_line/train.py` demonstrates the training using [PaddlePaddle](http://paddlepaddle.org). + +### Initialize PaddlePaddle -Executing the following command to split the dataset and write the training and test set into the `train.list` and `test.list` files, so that later PaddlePaddle can read from them. ```python -python prepare_data.py -r 0.8 #8:2 is the default split ratio +paddle.init(use_gpu=False, trainer_count=1) ``` -When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process begins. These hyperparameters are not part of the model parameters and cannot be trained using the same Loss Function (e.g., the number of layers in the network). Thus we will try several sets of hyperparameters to get several models, and compare these trained models on the validation set to pick the best one, and finally it on the test set. Because our model is relatively simple in this problem, we ignore this validation process for now. +### Model Configuration -### Provide Data to PaddlePaddle -After the data is prepared, we use a Python Data Provider to provide data for PaddlePaddle. A Data Provider is a Python function which will be called by PaddlePaddle during training. 
In this example, the Data Provider only needs to read the data and return it to the training process of PaddlePaddle line by line. +Linear regression is essentially a fully-connected layer with linear activation: ```python -from paddle.trainer.PyDataProvider2 import * -import numpy as np -#define data type and dimensionality -@provider(input_types=[dense_vector(13), dense_vector(1)]) -def process(settings, input_file): - data = np.load(input_file.strip()) - for row in data: - yield row[:-1].tolist(), row[-1:].tolist() +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) +y_predict = paddle.layer.fc(input=x, + size=1, + act=paddle.activation.Linear()) +y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) +cost = paddle.layer.regression_cost(input=y_predict, label=y) +``` +### Create Parameters +```python +parameters = paddle.parameters.create(cost) ``` -## Model Configuration +### Create Trainer -### Data Definition -We first call the function `define_py_data_sources2` to let PaddlePaddle read training and test data from the `dataprovider.py` in the above. PaddlePaddle can accept configuration info from the command line, for example, here we pass a variable named `is_predict` to control the model to have different structures during training and test. ```python -from paddle.trainer_config_helpers import * +optimizer = paddle.optimizer.Momentum(momentum=0) -is_predict = get_config_arg('is_predict', bool, False) +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) +``` -define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process') +### Feeding Data -``` +PaddlePaddle provides the +[reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader) +for loading training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers. 
-### Algorithm Settings -Next we need to set the details of the optimization algorithm. Due to the simplicity of the Linear Regression model, we only need to set the `batch_size` which defines how many samples are used every time for updating the parameters. ```python -settings(batch_size=2) +feeding={'x': 0, 'y': 1} ``` -### Network -Finally, we use `fc_layer` and `LinearActivation` to represent the Linear Regression model. +Moreover, an event handler is provided to print the training progress: + ```python -#input data of 13 dimensional house information -x = data_layer(name='x', size=13) - -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) - -if not is_predict: #when training, we use MSE (i.e., regression_cost) as the Loss Function - y = data_layer(name='y', size=1) - cost = regression_cost(input=y_predict, label=y) - outputs(cost) #output MSE to view the loss change -else: #during test, output the prediction value - outputs(y_predict) +# event_handler to print training and testing info +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding=feeding) + print "Test %d, Cost %f" % (event.pass_id, result.cost) ``` -## Training Model -We can run the PaddlePaddle command line trainer in the root directory of the code. Here we name the configuration file as `trainer_config.py`. We train 30 passes and save the result in the directory `output`: -```bash -./train.sh -``` +### Start Training -## Use Model -Now we can use the trained model to do prediction. 
-```bash -python predict.py -``` -Here by default we use the model in `output/pass-00029` for prediction, and compare the actual house price with the predicted one. The result is shown in `predictions.png`. -If you want to use another model or test on other data, you can pass in a new model path or data path: -```bash -python predict.py -m output/pass-00020 -t data/housing.test.npy +```python +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + feeding=feeding, + event_handler=event_handler, + num_passes=30) ``` ## Summary -In this chapter, we have introduced the Linear Regression model using the UCI Housing Data Set as an example. We have shown how to train and test this model with PaddlePaddle. Many more complex models and techniques are derived from this simple linear model, thus it is important for us to understand how it works. +This chapter introduces *Linear Regression* and how to train and test this model with PaddlePaddle, using the UCI Housing Data Set. Because a large number of more complex models and techniques are derived from linear regression, it is important to understand its underlying theory and limitation. ## References diff --git a/fit_a_line/README.md b/fit_a_line/README.md index c8becc9de54649648ba27654c607d9ca68ae53fa..266c6e91cb4d5249997203cada7ef920d3744386 100644 --- a/fit_a_line/README.md +++ b/fit_a_line/README.md @@ -45,14 +45,25 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 4. 
重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 +## 数据集 -## 数据准备 -执行以下命令来准备数据: -```bash -cd data && python prepare_data.py +### 数据集接口的封装 +首先加载需要的包 + +```python +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing ``` -这段代码将从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)下载数据并进行[预处理](#数据预处理),最后数据将被分为训练集和测试集。 +我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) + +其中,在uci_housing模块中封装了: + +1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。 +2. [数据预处理](#数据预处理)的过程。 + + +### 数据集介绍 这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下: | 属性名 | 解释 | 类型 | @@ -90,89 +101,89 @@ cd data && python prepare_data.py

#### 整理训练集与测试集 -我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。一种常见的分割比例为$8:2$,感兴趣的读者朋友们也可以尝试不同的设置来观察这两种误差的变化。 +我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$ + + +在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 + +## 训练 + +`fit_a_line/trainer.py`演示了训练的整体过程。 + +### 初始化PaddlePaddle -执行如下命令可以分割数据集,并将训练集和测试集的地址分别写入train.list 和 test.list两个文件中,供PaddlePaddle读取。 ```python -python prepare_data.py -r 0.8 #默认使用8:2的比例进行分割 +paddle.init(use_gpu=False, trainer_count=1) ``` -在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 +### 模型配置 -### 提供数据给PaddlePaddle -准备好数据之后,我们使用一个Python data provider来为PaddlePaddle的训练过程提供数据。一个 data provider 就是一个Python函数,它会被PaddlePaddle的训练过程调用。在这个例子里,只需要读取已经保存好的数据,然后一行一行地返回给PaddlePaddle的训练进程即可。 +线性回归的模型其实就是一个采用线性激活函数(linear activation,`LinearActivation`)的全连接层(fully-connected layer,`fc_layer`): ```python -from paddle.trainer.PyDataProvider2 import * -import numpy as np -#定义数据的类型和维度 -@provider(input_types=[dense_vector(13), dense_vector(1)]) -def process(settings, input_file): - data = np.load(input_file.strip()) - for row in data: - yield row[:-1].tolist(), row[-1:].tolist() +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) +y_predict = paddle.layer.fc(input=x, + size=1, + act=paddle.activation.Linear()) 
+y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) +cost = paddle.layer.regression_cost(input=y_predict, label=y) +``` +### 创建参数 +```python +parameters = paddle.parameters.create(cost) ``` -## 模型配置说明 +### 创建Trainer -### 数据定义 -首先,通过 `define_py_data_sources2` 来配置PaddlePaddle从上面的`dataprovider.py`里读入训练数据和测试数据。 PaddlePaddle接受从命令行读入的配置信息,例如这里我们传入一个名为`is_predict`的变量来控制模型在训练和测试时的不同结构。 ```python -from paddle.trainer_config_helpers import * +optimizer = paddle.optimizer.Momentum(momentum=0) -is_predict = get_config_arg('is_predict', bool, False) +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) +``` -define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process') +### 读取数据且打印训练的中间信息 -``` +PaddlePaddle提供一个 +[reader机制](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader) +来读取数据。 Reader返回的数据可以包括多列,我们需要一个Python dict把列 +序号映射到网络里的数据层。 -### 算法配置 -接着,指定模型优化算法的细节。由于线性回归模型比较简单,我们只要设置基本的`batch_size`即可,它指定每次更新参数的时候使用多少条数据计算梯度信息。 ```python -settings(batch_size=2) +feeding={'x': 0, 'y': 1} ``` -### 网络结构 -最后,使用`fc_layer`和`LinearActivation`来表示线性回归的模型本身。 +此外,我们还可以提供一个 event handler,来打印训练的进度: + ```python -#输入数据,13维的房屋信息 -x = data_layer(name='x', size=13) - -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) - -if not is_predict: #训练时,我们使用MSE,即regression_cost作为损失函数 - y = data_layer(name='y', size=1) - cost = regression_cost(input=y_predict, label=y) - outputs(cost) #训练时输出MSE来监控损失的变化 -else: #测试时,输出预测值 - outputs(y_predict) +# event_handler to print training and testing info +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.batch( + 
uci_housing.test(), batch_size=2), + feeding=feeding) + print "Test %d, Cost %f" % (event.pass_id, result.cost) ``` -## 训练模型 -在对应代码的根目录下执行PaddlePaddle的命令行训练程序。这里指定模型配置文件为`trainer_config.py`,训练30轮,结果保存在`output`路径下。 -```bash -./train.sh -``` +### 开始训练 -## 应用模型 -现在来看下如何使用已经训练好的模型进行预测。 -```bash -python predict.py -``` -这里默认使用`output/pass-00029`中保存的模型进行预测,并将数据中的房价与预测结果进行对比,结果保存在 `predictions.png`中。 -如果你想使用别的模型或者其它的数据进行预测,只要传入新的路径即可: -```bash -python predict.py -m output/pass-00020 -t data/housing.test.npy +```python +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + feeding=feeding, + event_handler=event_handler, + num_passes=30) ``` ## 总结 diff --git a/fit_a_line/index.en.html b/fit_a_line/index.en.html index b2492b2c8d0ab1126ba444acc669102bc02ebdfb..43206eb21b16ebcf3144731752424fe6ef68a568 100644 --- a/fit_a_line/index.en.html +++ b/fit_a_line/index.en.html @@ -54,8 +54,8 @@ where $\omega_{d}$ and $b$ are the model parameters we want to estimate. Once th ## Results Demonstration We first show the training result of our model. We use the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) to train a linear model and predict the house prices in Boston. The figure below shows the predictions the model makes for some house prices. The $X$ coordinate of each point represents the median value of the prices of a certain type of houses, while the $Y$ coordinate represents the predicted value by our linear model. When $X=Y$, the point lies exactly on the dotted line. In other words, the more precise the model predicts, the closer the point is to the dotted line.

-
- Figure 1. Predicted Value V.S. Actual Value +
+ Figure 1. Predicted Value V.S. Actual Value

## Model Overview @@ -126,8 +126,8 @@ There are at least three reasons for [Feature Normalization](https://en.wikipedi - Many Machine Learning techniques or models (e.g., L1/L2 regularization and Vector Space Model) are based on the assumption that all the features have roughly zero means and their value ranges are similar.

-
- Figure 2. The value ranges of the features +
+ Figure 2. The value ranges of the features

#### Prepare Training and Test Sets @@ -151,7 +151,7 @@ import numpy as np def process(settings, input_file): data = np.load(input_file.strip()) for row in data: - yield row[:-1].tolist(), row[-1:].tolist() + yield row[:-1].tolist(), row[-1:].tolist() ``` @@ -246,6 +246,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/fit_a_line/index.html b/fit_a_line/index.html index 7bb9e8a2f6b69b766f17eb72d5cd9d9844138b2d..d52c5f9fd169482cc75375939c1cfff7433b562f 100644 --- a/fit_a_line/index.html +++ b/fit_a_line/index.html @@ -56,8 +56,8 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo ## 效果展示 我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。

-
- 图1. 预测值 V.S. 真实值 +
+ 图1. 预测值 V.S. 真实值

## 模型概览 @@ -126,8 +126,8 @@ cd data && python prepare_data.py - 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。

-
- 图2. 各维属性的取值范围 +
+ 图2. 各维属性的取值范围

#### 整理训练集与测试集 @@ -151,7 +151,7 @@ import numpy as np def process(settings, input_file): data = np.load(input_file.strip()) for row in data: - yield row[:-1].tolist(), row[-1:].tolist() + yield row[:-1].tolist(), row[-1:].tolist() ``` @@ -246,6 +246,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/fit_a_line/train.py b/fit_a_line/train.py new file mode 100644 index 0000000000000000000000000000000000000000..df665c436872bbaaf5c08790cba69c8ac17e5db7 --- /dev/null +++ b/fit_a_line/train.py @@ -0,0 +1,53 @@ +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing + + +def main(): + # init + paddle.init(use_gpu=False, trainer_count=1) + + # network config + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + cost = paddle.layer.regression_cost(input=y_predict, label=y) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + optimizer = paddle.optimizer.Momentum(momentum=0) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + feeding = {'x': 0, 'y': 1} + + # event_handler to print training and testing info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding=feeding) + print "Test %d, Cost %f" % (event.pass_id, result.cost) + + # training + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + 
feeding=feeding, + event_handler=event_handler, + num_passes=30) + + +if __name__ == '__main__': + main() diff --git a/gan/index.html b/gan/index.html index 1f88c3593d9d0027c58c83fb78543aae95c2b5b5..13561cada8bbb199014ab2169cc2d46e2ad69f24 100644 --- a/gan/index.html +++ b/gan/index.html @@ -58,6 +58,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/image_caption/index.html b/image_caption/index.html index bd5f85aff99e291b01dc091f7a3d4ac622bce4a6..50660d51f763d0675fa13d770a92bf5242f63788 100644 --- a/image_caption/index.html +++ b/image_caption/index.html @@ -57,6 +57,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/image_classification/README.md b/image_classification/README.md index 538760d429d6f250eed2ce578e10001526d11abb..829e99a2b9cb9819d87df9bfc53dd81f1a2a6147 100644 --- a/image_classification/README.md +++ b/image_classification/README.md @@ -3,7 +3,7 @@ 本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 -## 背景介绍 +## 背景介绍 图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 @@ -51,7 +51,7 @@ 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 4). 
**通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - + 这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 @@ -67,8 +67,8 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得


-图5. CNN网络示例[20] -

+图5. CNN网络示例[20] +

- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 - 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 @@ -108,7 +108,7 @@ GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普


-图8. GoogleNet[12] +图8. GoogleNet[12]

@@ -174,7 +174,7 @@ paddle.init(use_gpu=False, trainer_count=1) 1. 定义数据输入及其维度 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 - + ```python datadim = 3 * 32 * 32 classdim = 10 @@ -189,7 +189,7 @@ paddle.init(use_gpu=False, trainer_count=1) net = vgg_bn_drop(image) ``` VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - + ```python def vgg_bn_drop(input): def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): @@ -220,11 +220,11 @@ paddle.init(use_gpu=False, trainer_count=1) fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) return fc2 ``` - + 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成, - + 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - + 2.3. 最后接两层512维的全连接。 3. 定义分类器 @@ -240,7 +240,7 @@ paddle.init(use_gpu=False, trainer_count=1) 4. 定义损失函数和网络输出 在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - + ```python lbl = paddle.layer.data( name="label", type=paddle.data_type.integer_value(classdim)) @@ -305,9 +305,9 @@ def layer_warp(block_func, ipt, features, count, stride): `resnet_cifar10` 的连接结构主要有以下几个过程。 -1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 +1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 -3. 最后对网络做均值池化并返回该层。 +3. 最后对网络做均值池化并返回该层。 注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 @@ -452,7 +452,7 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123} [2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. 
Computer Vision and Pattern Recognition, 2005. -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. +[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. [4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. diff --git a/image_classification/deprecated/README.md b/image_classification/deprecated/README.md index f70b819af68e4b10b756e2daaac60202c71cdc7d..a82a32c8a84cedd5da05e2a66f791819f09f65cb 100644 --- a/image_classification/deprecated/README.md +++ b/image_classification/deprecated/README.md @@ -3,7 +3,7 @@ 本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 -## 背景介绍 +## 背景介绍 图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 @@ -51,7 +51,7 @@ 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 4). 
**通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - + 这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 @@ -67,8 +67,8 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得


-图5. CNN网络示例[20] -

+图5. CNN网络示例[20] +

- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 - 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 @@ -108,7 +108,7 @@ GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普


-图8. GoogleNet[12] +图8. GoogleNet[12]

@@ -245,7 +245,7 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ 1. 定义数据输入及其维度 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 - + ```python datadim = 3 * 32 * 32 classdim = 10 @@ -258,7 +258,7 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ net = vgg_bn_drop(data) ``` VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - + ```python def vgg_bn_drop(input, num_channels): def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None): @@ -273,26 +273,26 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ conv_with_batchnorm=True, conv_batchnorm_drop_rate=dropouts, pool_type=MaxPooling()) - + conv1 = conv_block(input, 64, 2, [0.3, 0], 3) conv2 = conv_block(conv1, 128, 2, [0.4, 0]) conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - + drop = dropout_layer(input=conv5, dropout_rate=0.5) fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) bn = batch_norm_layer( input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) fc2 = fc_layer(input=bn, size=512, act=LinearActivation()) return fc2 - + ``` - + 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.trainer_config_helpers`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成, - + 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - + 2.3. 最后接两层512维的全连接。 3. 定义分类器 @@ -306,7 +306,7 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ 4. 定义损失函数和网络输出 在有监督训练中需要输入图像对应的类别信息,同样通过`data_layer`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - + ```python if not is_predict: lbl = data_layer(name="label", size=class_num) @@ -383,9 +383,9 @@ def layer_warp(block_func, ipt, features, count, stride): `resnet_cifar10` 的连接结构主要有以下几个过程。 -1. 
底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 +1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 -3. 最后对网络做均值池化并返回该层。 +3. 最后对网络做均值池化并返回该层。 注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 @@ -487,7 +487,7 @@ python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png


-图13. 卷积特征可视化图 +图13. 卷积特征可视化图

## 总结 @@ -501,7 +501,7 @@ python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png [2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. +[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. [4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. diff --git a/image_classification/index.en.html b/image_classification/index.en.html index b6a80785027068f062e5e3ae21eec6ef7c4f143b..03323fd89f07ad6b7fab84d05f578982760ffa40 100644 --- a/image_classification/index.en.html +++ b/image_classification/index.en.html @@ -289,48 +289,48 @@ First we define VGG network. Since the image size and amount of CIFAR10 are rela The input to the network is defined as `data_layer`, or image pixels in the context of image classification. The images in CIFAR10 are 32x32 color images of three channels. Therefore, the size of the input data is 3072 (3x32x32), and the number of categories is 10. - ```python - datadim = 3 * 32 * 32 - classdim = 10 - data = data_layer(name='image', size=datadim) - ``` + ```python + datadim = 3 * 32 * 32 + classdim = 10 + data = data_layer(name='image', size=datadim) + ``` 2. Define VGG main module - ```python - net = vgg_bn_drop(data) - ``` + ```python + net = vgg_bn_drop(data) + ``` The input to VGG main module is from data layer. 
`vgg_bn_drop` defines a 16-layer VGG network, with each convolutional layer followed by BN and dropout layers. Here is the definition in detail: - ```python - def vgg_bn_drop(input, num_channels): - def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None): - return img_conv_group( - input=ipt, - num_channels=num_channels_, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=ReluActivation(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=MaxPooling()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = dropout_layer(input=conv5, dropout_rate=0.5) - fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) - bn = batch_norm_layer( - input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) - fc2 = fc_layer(input=bn, size=512, act=LinearActivation()) - return fc2 - - ``` + ```python + def vgg_bn_drop(input, num_channels): + def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None): + return img_conv_group( + input=ipt, + num_channels=num_channels_, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=ReluActivation(), + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type=MaxPooling()) + + conv1 = conv_block(input, 64, 2, [0.3, 0], 3) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = dropout_layer(input=conv5, dropout_rate=0.5) + fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) + bn = batch_norm_layer( + input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) + fc2 = fc_layer(input=bn, 
size=512, act=LinearActivation()) + return fc2 + + ``` 2.1. First defines a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.trainer_config_helpers` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`. @@ -344,22 +344,22 @@ First we define VGG network. Since the image size and amount of CIFAR10 are rela The above VGG network extracts high-level features and maps them to a vector of the same size as the categories. Softmax function or classifier is then used for calculating the probability of the image belonging to each category. - ```python - out = fc_layer(input=net, size=class_num, act=SoftmaxActivation()) - ``` + ```python + out = fc_layer(input=net, size=class_num, act=SoftmaxActivation()) + ``` 4. Define Loss Function and Outputs In the context of supervised learning, labels of training images are defined in `data_layer`, too. During training, cross-entropy is used as loss function and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. 
- ```python - if not is_predict: - lbl = data_layer(name="label", size=class_num) - cost = classification_cost(input=out, label=lbl) - outputs(cost) - else: - outputs(out) - ``` + ```python + if not is_predict: + lbl = data_layer(name="label", size=class_num) + cost = classification_cost(input=out, label=lbl) + outputs(cost) + else: + outputs(out) + ``` ### ResNet @@ -607,6 +607,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/image_classification/index.html b/image_classification/index.html index 0e48c728c61a2b12aa400f8840c1bc0478bf21df..79262c469d87502090cfa3e986b4904da0df5a0d 100644 --- a/image_classification/index.html +++ b/image_classification/index.html @@ -44,7 +44,7 @@ 本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 -## 背景介绍 +## 背景介绍 图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 @@ -92,7 +92,7 @@ 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 4). 
**通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - + 这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 @@ -108,8 +108,8 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得


-图5. CNN网络示例[20] -

+图5. CNN网络示例[20] +

- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 - 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 @@ -149,7 +149,7 @@ GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普


-图8. GoogleNet[12] +图8. GoogleNet[12]

@@ -177,108 +177,35 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类 ## 数据准备 -### 数据介绍与下载 - -通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。 +通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10,000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。


图11. CIFAR10数据集[21]

-下面命令用于下载数据和基于训练集计算图像均值,在网络输入前,基于该均值对输入数据做预处理。 - -```bash -./data/get_data.sh -``` +Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。 -### 数据提供给PaddlePaddle +通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。 -我们使用Python接口传递数据给系统,下面 `dataprovider.py` 针对CIFAR10数据给出了完整示例。 - -- `initializer` 函数进行dataprovider的初始化,这里加载图像的均值,定义了输入image和label两个字段的类型。 +### 模型结构 -- `process` 函数将数据逐条传输给系统,在图像分类任务里,可以在该函数中完成数据扰动操作,再传输给PaddlePaddle。这里对训练集做随机左右翻转,并将原始图片减去均值后传输给系统。 +#### Paddle 初始化 +通过 `paddle.init`,初始化Paddle是否使用GPU,trainer的数目等等。 ```python -import numpy as np -import cPickle -from paddle.trainer.PyDataProvider2 import * - -def initializer(settings, mean_path, is_train, **kwargs): - settings.is_train = is_train - settings.input_size = 3 * 32 * 32 - settings.mean = np.load(mean_path)['mean'] - settings.input_types = { - 'image': dense_vector(settings.input_size), - 'label': integer_value(10) - } - - -@provider(init_hook=initializer, pool_size=50000) -def process(settings, file_list): - with open(file_list, 'r') as fdata: - for fname in fdata: - fo = open(fname.strip(), 'rb') - batch = cPickle.load(fo) - fo.close() - images = batch['data'] - labels = batch['labels'] - for im, lab in zip(images, labels): - if settings.is_train and np.random.randint(2): - im = im.reshape(3, 32, 32) - im = im[:,:,::-1] - im = im.flatten() - im = im - settings.mean - yield { - 'image': im.astype('float32'), - 'label': int(lab) - } -``` - -## 模型配置说明 +import sys +import paddle.v2 as paddle +from vgg import vgg_bn_drop +from resnet import resnet_cifar10 -### 数据定义 - -在模型配置中,定义通过 `define_py_data_sources2` 函数从 dataprovider 中读入数据, 其中 args 指定均值文件的路径。如果该配置文件用于预测,则不需要数据定义部分。 - -```python -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) -if not is_predict: - define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process', - args={'mean_path': 'data/mean.meta'}) +# PaddlePaddle init 
+paddle.init(use_gpu=False, trainer_count=1) ``` -### 算法配置 - -在模型配置中,通过 `settings` 设置训练使用的优化算法,并指定batch size 、初始学习率、momentum以及L2正则。 - -```python -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128),) -``` - -通过 `learning_rate_decay_a` (简写$a$) 、`learning_rate_decay_b` (简写$b$) 和 `learning_rate_schedule` 指定学习率调整策略,这里采用离散指数的方式调节学习率,计算公式如下, $n$ 代表已经处理过的累计总样本数,$lr_{0}$ 即为 `settings` 里设置的 `learning_rate`。 - -$$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ - -### 模型结构 - 本教程中我们提供了VGG和ResNet两个模型的配置。 #### VGG @@ -287,77 +214,79 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ 1. 定义数据输入及其维度 - 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 - - ```python - datadim = 3 * 32 * 32 - classdim = 10 - data = data_layer(name='image', size=datadim) - ``` + 网络输入定义为 `paddle.layer.data` (数据层),在图像分类中即为图像像素信息。CIFAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 + + ```python + datadim = 3 * 32 * 32 + classdim = 10 + + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(datadim)) + ``` 2. 
定义VGG网络核心模块 - ```python - net = vgg_bn_drop(data) - ``` - VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - - ```python - def vgg_bn_drop(input, num_channels): - def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None): - return img_conv_group( - input=ipt, - num_channels=num_channels_, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=ReluActivation(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=MaxPooling()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = dropout_layer(input=conv5, dropout_rate=0.5) - fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) - bn = batch_norm_layer( - input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) - fc2 = fc_layer(input=bn, size=512, act=LinearActivation()) - return fc2 - - ``` - - 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.trainer_config_helpers`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成, - - 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - - 2.3. 
最后接两层512维的全连接。 + ```python + net = vgg_bn_drop(image) + ``` + VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: + + ```python + def vgg_bn_drop(input): + def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): + return paddle.networks.img_conv_group( + input=ipt, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type=paddle.pooling.Max()) + + conv1 = conv_block(input, 64, 2, [0.3, 0], 3) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5) + fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear()) + bn = paddle.layer.batch_norm( + input=fc1, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) + return fc2 + ``` + + 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成, + + 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 + + 2.3. 最后接两层512维的全连接。 3. 定义分类器 - 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 + 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 - ```python - out = fc_layer(input=net, size=class_num, act=SoftmaxActivation()) - ``` + ```python + out = paddle.layer.fc(input=net, + size=classdim, + act=paddle.activation.Softmax()) + ``` 4. 
定义损失函数和网络输出 - 在有监督训练中需要输入图像对应的类别信息,同样通过`data_layer`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - - ```python - if not is_predict: - lbl = data_layer(name="label", size=class_num) - cost = classification_cost(input=out, label=lbl) - outputs(cost) - else: - outputs(out) - ``` + 在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 + + ```python + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(classdim)) + cost = paddle.layer.classification_cost(input=out, label=lbl) + ``` ### ResNet @@ -381,158 +310,178 @@ def conv_bn_layer(input, filter_size, stride, padding, - active_type=ReluActivation(), + active_type=paddle.activation.Relu(), ch_in=None): - tmp = img_conv_layer( + tmp = paddle.layer.img_conv( input=input, filter_size=filter_size, num_channels=ch_in, num_filters=ch_out, stride=stride, padding=padding, - act=LinearActivation(), + act=paddle.activation.Linear(), bias_attr=False) - return batch_norm_layer(input=tmp, act=active_type) - + return paddle.layer.batch_norm(input=tmp, act=active_type) def shortcut(ipt, n_in, n_out, stride): if n_in != n_out: - return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation()) + return conv_bn_layer(ipt, n_out, 1, stride, 0, + paddle.activation.Linear()) else: return ipt def basicblock(ipt, ch_out, stride): - ch_in = ipt.num_filters + ch_in = ch_out * 2 tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out, stride) - return addto_layer(input=[ipt, short], act=ReluActivation()) - -def bottleneck(ipt, ch_out, stride): - ch_in = ipt.num_filter - tmp = conv_bn_layer(ipt, ch_out, 1, stride, 0) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1) - tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation()) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear()) short = shortcut(ipt, ch_in, ch_out, stride) - return addto_layer(input=[ipt, 
short], act=ReluActivation()) + return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu()) def layer_warp(block_func, ipt, features, count, stride): tmp = block_func(ipt, features, stride) for i in range(1, count): tmp = block_func(tmp, features, 1) return tmp - ``` `resnet_cifar10` 的连接结构主要有以下几个过程。 -1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 +1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 -3. 最后对网络做均值池化并返回该层。 +3. 最后对网络做均值池化并返回该层。 注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 ```python -def resnet_cifar10(ipt, depth=56): +def resnet_cifar10(ipt, depth=32): # depth should be one of 20, 32, 44, 56, 110, 1202 assert (depth - 2) % 6 == 0 n = (depth - 2) / 6 nStages = {16, 64, 128} - conv1 = conv_bn_layer(ipt, - ch_in=3, - ch_out=16, - filter_size=3, - stride=1, - padding=1) + conv1 = conv_bn_layer( + ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) res1 = layer_warp(basicblock, conv1, 16, n, 1) res2 = layer_warp(basicblock, res1, 32, n, 2) res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = img_pool_layer(input=res3, - pool_size=8, - stride=1, - pool_type=AvgPooling()) + pool = paddle.layer.img_pool( + input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg()) return pool ``` -## 模型训练 +## 训练模型 + +### 定义参数 -执行脚本 train.sh 进行模型训练, 其中指定配置文件、设备类型、线程个数、总共训练的轮数、模型存储路径等。 +首先依据模型配置的`cost`定义模型参数。 -``` bash -sh train.sh +```python +# Create parameters +parameters = paddle.parameters.create(cost) ``` -脚本 `train.sh` 如下: - -```bash -#cfg=models/resnet.py -cfg=models/vgg.py -output=output -log=train.log - -paddle train \ - --config=$cfg \ - --use_gpu=true \ - --trainer_count=1 \ - --log_period=100 \ - --num_passes=300 \ - --save_dir=$output \ - 2>&1 | tee $log +可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 + +```python +print parameters.keys() ``` -- `--config=$cfg` : 指定配置文件,默认是 `models/vgg.py`。 -- `--use_gpu=true` : 指定使用GPU训练,若使用CPU,设置为false。 -- 
`--trainer_count=1` : 指定线程个数或GPU个数。 -- `--log_period=100` : 指定日志打印的batch间隔。 -- `--save_dir=$output` : 指定模型存储路径。 +### 构造训练(Trainer) -一轮训练log示例如下所示,经过1个pass, 训练集上平均error为0.79958 ,测试集上平均error为0.7858 。 +根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的Momentum方法,同时设定了学习率、正则等。 -```text -TrainerInternal.cpp:165] Batch=300 samples=38400 AvgCost=2.07708 CurrentCost=1.96158 Eval: classification_error_evaluator=0.81151 CurrentEval: classification_error_evaluator=0.789297 -TrainerInternal.cpp:181] Pass=0 Batch=391 samples=50000 AvgCost=2.03348 Eval: classification_error_evaluator=0.79958 -Tester.cpp:115] Test samples=10000 cost=1.99246 Eval: classification_error_evaluator=0.7858 +```python +# Create optimizer +momentum_optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128), + learning_rate=0.1 / 128.0, + learning_rate_decay_a=0.1, + learning_rate_decay_b=50000 * 100, + learning_rate_schedule='discexp', + batch_size=128) + +# Create trainer +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=momentum_optimizer) ``` -图12是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 +通过 `learning_rate_decay_a` (简写$a$) 、`learning_rate_decay_b` (简写$b$) 和 `learning_rate_schedule` 指定学习率调整策略,这里采用离散指数的方式调节学习率,计算公式如下, $n$ 代表已经处理过的累计总样本数,$lr_{0}$ 即为 `paddle.optimizer.Momentum` 里设置的 `learning_rate`。 -

-
-图12. CIFAR10数据集上VGG模型的分类错误率 -

+$$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ -## 模型应用 -在训练完成后,模型会保存在路径 `output/pass-%05d` 下,例如第300个pass的模型会保存在路径 `output/pass-00299`。 可以使用脚本 `classify.py` 对图片进行预测或提取特征,注意该脚本默认使用模型配置为 `models/vgg.py`, +### 训练 +cifar.train10()每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。 -### 预测 +```python +reader=paddle.reader.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10(), buf_size=50000), + batch_size=128) +``` -可以按照下面方式预测图片的类别,默认使用GPU预测,如果使用CPU预测,在后面加参数 `-c`即可。 +通过`feeding`来指定每一个数据和`paddle.layer.data`的对应关系。例如: `cifar.train10()`产生数据的第0列对应image层的特征。 -```bash -python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png # -c +```python +feeding={'image': 0, + 'label': 1} ``` -预测结果为: +可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。 -```text -Label of image/dog.png is: 5 +```python +# End batch and end pass event handler +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.reader.batch( + paddle.dataset.cifar.test10(), batch_size=128), + reader_dict={'image': 0, + 'label': 1}) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` -### 特征提取 +通过`trainer.train`函数训练: + +```python +trainer.train( + reader=reader, + num_passes=200, + event_handler=event_handler, + feeding=feeding) +``` -可以按照下面方式对图片提取特征,和预测使用方式不同的是指定job类型为extract,并需要指定提取的层。`classify.py` 默认以第一层卷积特征为例提取特征,并画出了类似图13的可视化图。VGG模型的第一层卷积有64个通道,图13展示了每个通道的灰度图。 +一轮训练log示例如下所示,经过1个pass, 训练集上平均error为0.6875 ,测试集上平均error为0.8852 。 -```bash -python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png # -c +```text +Pass 0, Batch 0, Cost 2.473182, {'classification_error_evaluator': 0.9140625} 
+................................................................................................... +Pass 0, Batch 100, Cost 1.913076, {'classification_error_evaluator': 0.78125} +................................................................................................... +Pass 0, Batch 200, Cost 1.783041, {'classification_error_evaluator': 0.7421875} +................................................................................................... +Pass 0, Batch 300, Cost 1.668833, {'classification_error_evaluator': 0.6875} +.......................................................................................... +Test with Pass 0, {'classification_error_evaluator': 0.885200023651123} ``` +图12是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 +

-
-图13. 卷积特征可视化图 +
+图12. CIFAR10数据集上VGG模型的分类错误率

+ ## 总结 传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。 @@ -544,7 +493,7 @@ python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png [2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. +[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. [4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. 
@@ -604,6 +553,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/image_detection/index.html b/image_detection/index.html index bd5f85aff99e291b01dc091f7a3d4ac622bce4a6..50660d51f763d0675fa13d770a92bf5242f63788 100644 --- a/image_detection/index.html +++ b/image_detection/index.html @@ -57,6 +57,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/image_qa/index.html b/image_qa/index.html index bd5f85aff99e291b01dc091f7a3d4ac622bce4a6..50660d51f763d0675fa13d770a92bf5242f63788 100644 --- a/image_qa/index.html +++ b/image_qa/index.html @@ -57,6 +57,6 @@ marked.setOptions({ } }); document.getElementById("context").innerHTML = marked( - document.getElementById("markdown").innerHTML) + document.getElementById("markdown").innerHTML) diff --git a/index.html b/index.html index 8e83e91e087fdf59bebf41d11988c49e2b813e01..38b9457f8034cb6bb381e88843ad60e2092b49ad 100644 --- a/index.html +++ b/index.html @@ -1,7 +1,7 @@ - + - Please access github home page + Please access github home page diff --git a/label_semantic_roles/README.api.md b/label_semantic_roles/README.api.md deleted file mode 100644 index 635742b5a1d67144a078ee1ebd3d717eb10e83f7..0000000000000000000000000000000000000000 --- a/label_semantic_roles/README.api.md +++ /dev/null @@ -1,466 +0,0 @@ -# 语义角色标注 - -本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 - -## 背景介绍 - 
-自然语言分析技术大致分为三个层面:词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中,谓词是对主语的陈述或说明,指出“做什么”、“是什么”或“怎么样,代表了一个事件的核心,跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有:施事者(Agent)、受事者(Patient)、客体(Theme)、经验者(Experiencer)、受益者(Beneficiary)、工具(Instrument)、处所(Location)、目标(Goal)和来源(Source)等。 - -请看下面的例子,“遇到” 是谓词(Predicate,通常简写为“Pred”),“小明”是施事者(Agent),“小红”是受事者(Patient),“昨天” 是事件发生的时间(Time),“公园”是事情发生的地点(Location)。 - -$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mbox{Time}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$ - -语义角色标注(Semantic Role Labeling,SRL)以句子的谓词为中心,不对句子所包含的语义信息进行深入分析,只分析句子中各成分与谓词之间的关系,即句子的谓词(Predicate)- 论元(Argument)结构,并用语义角色来描述这些结构关系,是许多自然语言理解任务(如信息抽取,篇章分析,深度问答等)的一个重要中间步骤。在研究中一般都假定谓词是给定的,所要做的就是找出给定谓词的各个论元和它们的语义角色。 - -传统的SRL系统大多建立在句法分析基础之上,通常包括5个流程: - -1. 构建一棵句法分析树,例如,图1是对上面例子进行依存句法分析得到的一棵句法树。 -2. 从句法树上识别出给定谓词的候选论元。 -3. 候选论元剪除;一个句子中的候选论元可能很多,候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。 -4. 论元识别:这个过程是从上一步剪除之后的候选中判断哪些是真正的论元,通常当做一个二分类问题来解决。 -5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。 - -
-
-图1. 依存句法分析句法树示例 -
- -然而,完全句法分析需要确定句子所包含的全部句法信息,并确定句子各成分之间的关系,是一个非常困难的任务,目前技术下的句法分析准确率并不高,句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度,同时获得一定的句法结构信息,“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析(partial parsing)或语块划分(chunking)。和完全句法分析得到一颗完整的句法树不同,浅层句法分析只需要识别句子中某些结构相对简单的独立成分,例如:动词短语,这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难,一些研究\[[1](#参考文献)\]也提出了基于语块(chunk)的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集,我们先来介绍这种表示方法。在BIO表示法中,B代表语块的开始,I代表语块的中间,O代表语块结束。通过B、I、O 三种标记将不同的语块赋予不同的标签,例如:对于一个角色为A的论元,将它所包含的第一个语块赋予标签B-A,将它所包含的其它语块赋予标签I-A,不属于任何论元的语块赋予标签O。 - -我们继续以上面的这句话为例,图1展示了BIO表示方法。 - -
-
-图2. BIO标注方法示例 -
- -从上面的例子可以看到,根据序列标注结果可以直接得到论元的语义角色标注结果,是一个相对简单的过程。这种简单性体现在:(1)依赖浅层句法分析,降低了句法分析的要求和难度;(2)没有了候选论元剪除这一步骤;(3)论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法,简化了流程,降低了错误累积的风险,往往能够取得更好的结果。 - -与基于语块的SRL方法类似,在本教程中我们也将SRL看作一个序列标注问题,不同的是,我们只依赖输入文本序列,不依赖任何额外的语法解析结果或是复杂的人造特征,利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例,实践下面的任务:给定一句话和这句话里的一个谓词,通过序列标注的方式,从句子中找到谓词对应的论元,同时标注它们的语义角色。 - -## 模型概览 - -循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛地应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 - -### 栈式循环神经网络(Stacked Recurrent Neural Network) - -深层网络有助于形成层次化特征,网络上层在下层已经学习到的初级特征基础上,形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络,但由于LSTM各个时间步参数共享,$t-1$时刻状态到$t$时刻的映射,始终只经过了一次非线性映射,也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元,令前一个LSTM$t$时刻的输出,成为下一个LSTM单元$t$时刻的输入,帮助我们构建起一个深层网络,我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力,能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。 - -然而,训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常,堆叠4层LSTM单元可以正常训练,当层数达到4~8层时,会出现性能衰减,这时必须考虑一些新的结构以保证梯度纵向顺畅传播,这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失梯度爆炸” 问题的智慧之一:在记忆单元(Memory Cell)这条信息传播的路线上没有非线性映射,当梯度反向传播时既不会衰减、也不会爆炸。因此,深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。 - -一个LSTM单元完成的运算可以被分为三部分:(1)输入到隐层的映射(input-to-hidden) :每个时间步输入信息$x$会首先经过一个矩阵映射,再作为遗忘门,输入门,记忆单元,输出门的输入,注意,这一次映射没有引入非线性激活;(2)隐层到隐层的映射(hidden-to-hidden):这一步是LSTM计算的主体,包括遗忘门,输入门,记忆单元更新,输出门的计算;(3)隐层到输出的映射(hidden-to-output):通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上,加入一条新的路径:除上一层LSTM输出之外,将前层LSTM的输入到隐层的映射作为的一个新的输入,同时加入一个线性映射去学习一个新的变换。 - -图3是最终得到的栈式循环神经网络结构示意图。 - -

-
-图3. 基于LSTM的栈式循环神经网络结构示意图 -

- -### 双向循环神经网络(Bidirectional Recurrent Neural Network) - -在LSTM中,$t$时刻的隐藏层向量编码了到$t$时刻为止所有输入的信息,但$t$时刻的LSTM可以看到历史,却无法看到未来。在绝大多数自然语言处理任务中,我们几乎总是能拿到整个句子。这种情况下,如果能够像获取历史信息一样,得到未来的信息,对序列学习任务会有很大的帮助。 - -为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 - -

-
-图4. 基于LSTM的双向循环神经网络结构示意图 -

- -需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.md)任务中,介绍另一种双向循环神经网络。 - -### 条件随机场 (Conditional Random Field) - -使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务。在SRL任务中,深层LSTM网络学习输入的特征表示,条件随机场(Conditional Random Filed, CRF)在特征的基础上完成序列标注,处于整个网络的末端。 - -CRF是一种概率化结构模型,可以看作是一个概率无向图模型,结点表示随机变量,边表示随机变量之间的概率依赖关系。简单来讲,CRF学习条件概率$P(X|Y)$,其中 $X = (x_1, x_2, ... , x_n)$ 是输入序列,$Y = (y_1, y_2, ... , y_n)$ 是标记序列;解码过程是给定 $X$序列求解令$P(Y|X)$最大的$Y$序列,即$Y^* = \mbox{arg max}_{Y} P(Y | X)$。 - -序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 - -

-
-图5. 序列标注任务中使用的线性链条件随机场 -

- -根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\],在给定观测序列$X$时,一个特定标记序列$Y$的概率可以定义为: - -$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ - -其中$Z(X)$是归一化因子,$t_j$ 是定义在边上的特征函数,依赖于当前和前一个位置,称为转移特征,表示对于输入序列$X$及其标注序列在 $i$及$i - 1$位置上标记的转移概率。$s_k$是定义在结点上的特征函数,称为状态特征,依赖于当前位置,表示对于观察序列$X$及其$i$位置的标记概率。$\lambda_j$ 和 $\mu_k$ 分别是转移特征函数和状态特征函数对应的权值。实际上,$t$和$s$可以用相同的数学形式表示,再对转移特征和状态特在各个位置$i$求和有:$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$,把$f$统称为特征函数,于是$P(Y|X)$可表示为: - -$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ - -$\omega$是特征函数对应的权值,是CRF模型要学习的参数。训练时,对于给定的输入序列和对应的标记序列集合$D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$ ,通过正则化的极大似然估计,求解如下优化目标: - -$$L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ - -这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时,对于给定的输入序列$X$,通过解码算法(通常有:维特比算法、Beam Search)求令出条件概率$\bar{P}(Y|X)$最大的输出序列 $\bar{Y}$。 - -### 深度双向LSTM(DB-LSTM)SRL模型 - -在SRL任务中,输入是 “谓词” 和 “一句话”,目标是从这句话中找到谓词的论元,并标注论元的语义角色。如果一个句子含有$n$个谓词,这个句子会被处理$n$次。一个最为直接的模型是下面这样: - -1. 构造输入; - - 输入1是谓词,输入2是句子 - - 将输入1扩展成和输入2一样长的序列,用one-hot方式表示; -2. one-hot方式的谓词序列和句子序列通过词表,转换为实向量表示的词向量序列; -3. 将步骤2中的2个词向量序列作为双向LSTM的输入,学习输入序列的特征表示; -4. CRF以步骤3中模型学习到的特征为输入,以标记序列为监督信号,实现序列标注; - -大家可以尝试上面这种方法。这里,我们提出一些改进,引入两个简单但对提高系统性能非常有效的特征: - -- 谓词上下文:上面的方法中,只用到了谓词的词向量表达谓词相关的所有信息,这种方法始终是非常弱的,特别是如果谓词在句子中出现多次,有可能引起一定的歧义。从经验出发,谓词前后若干个词的一个小片段,能够提供更丰富的信息,帮助消解歧义。于是,我们把这样的经验也添加到模型中,为每个谓词同时抽取一个“谓词上下文” 片段,也就是从这个谓词前后各取$n$个词构成的一个窗口片段; -- 谓词上下文区域标记:为句子中的每一个词引入一个0-1二值变量,表示它们是否在“谓词上下文”片段中; - -修改后的模型如下(图6是一个深度为4的模型结构示意图): - -1. 构造输入 - - 输入1是句子序列,输入2是谓词序列,输入3是谓词上下文,从句子中抽取这个谓词前后各$n$个词,构成谓词上下文,用one-hot方式表示,输入4是谓词上下文区域标记,标记了句子中每一个词是否在谓词上下文中; - - 将输入2~3均扩展为和输入1一样长的序列; -2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列;其中输入1、3共享同一个词表,输入2和4各自独有词表; -3. 第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特性表示序列; -4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; - -
-
-图6. SRL任务上的深层双向LSTM模型 -
- - -## 数据介绍 - -在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是,CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 - -原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中,我们使用test.wsj文件夹中的数据进行训练和测试,并只会用到words文件夹(文本序列)和props文件夹(标注结果)下的数据。本教程使用的数据目录如下: - -```text -conll05st-release/ -└── test.wsj - ├── props # 标注结果 - └── words # 输入文本序列 -``` - -标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同,但原理是相同的,关于标注结果标签含义的说明,请参考论文\[[9](#参考文献)\]。 - -原始数据需要进行数据预处理才能被PaddlePaddle处理,预处理包括下面几个步骤: - -1. 将文本序列和标记序列其合并到一条记录中; -2. 一个句子如果含有$n$个谓词,这个句子会被处理$n$次,变成$n$条独立的训练样本,每个样本一个不同的谓词; -3. 抽取谓词上下文和构造谓词上下文区域标记; -4. 构造以BIO法表示的标记; -5. 依据词典获取词对应的整数索引。 - - -```python -# import paddle.v2.dataset.conll05 as conll05 -# conll05.corpus_reader函数完成上面第1步和第2步. -# conll05.reader_creator函数完成上面第3步到第5步. -# conll05.test函数可以获取处理之后的每条样本来供PaddlePaddle训练. -``` - -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 - -| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | -|---|---|---|---|---| -| A | set | n't been set . × | 0 | B-A1 | -| record | set | n't been set . × | 0 | I-A1 | -| date | set | n't been set . × | 0 | I-A1 | -| has | set | n't been set . × | 0 | O | -| n't | set | n't been set . × | 1 | B-AM-NEG | -| been | set | n't been set . × | 1 | O | -| set | set | n't been set . × | 1 | B-V | -| . | set | n't been set . 
× | 1 | O | - - -除数据之外,我们同时提供了以下资源: - -| 文件名称 | 说明 | -|---|---| -| word_dict | 输入句子的词典,共计44068个词 | -| label_dict | 标记的词典,共计106个标记 | -| predicate_dict | 谓词的词典,共计3162个词 | -| emb | 一个训练好的词表,32维 | - -我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 - -获取词典,打印词典大小: - -```python -import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_len = len(verb_dict) - -print len(word_dict_len) -print len(label_dict_len) -print len(pred_len) -``` - -## 模型配置说明 - -1. 定义输入数据维度及模型超参数。 - - ```python - mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 - word_dim = 32 # 词向量维度 - mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 - hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 - depth = 8 # 栈式LSTM的深度 - - # 一条样本总共9个特征,下面定义了9个data层,每个层类型为integer_value_sequence,表示整数ID的序列类型. 
- def d_type(size): - return paddle.data_type.integer_value_sequence(size) - - # 句子序列 - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - # 谓词 - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - # 谓词上下文5个特征 - ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - - # 谓词上下区域标志 - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - # 标注序列 - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - ``` - - 这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 - -2. 将句子序列、谓词、谓词上下文、谓词上下文区域标记通过词表,转换为实向量表示的词向量序列。 - - ```python - - # 在本教程中,我们加载了预训练的词向量,这里设置了:is_static=True - # is_static 为 True 时保证了在训练 SRL 模型过程中,词表不再更新 - emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) - # 设置超参数 - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - std_default = paddle.attr.Param(initial_std=default_std) - std_0 = paddle.attr.Param(initial_std=0.) - - predicate_embedding = paddle.layer.embedding( - size=word_dim, - input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embedding( - size=mark_dim, input=mark, param_attr=std_0) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - ``` - -3. 
8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 - - ```python - hidden_0 = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - - mix_hidden_lr = 1e-3 - lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) - hidden_para_attr = paddle.attr.Param( - initial_std=default_std, learning_rate=mix_hidden_lr) - - lstm_0 = paddle.layer.lstmemory( - input=hidden_0, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - bias_attr=std_0, - param_attr=lstm_para_attr) - - #stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = paddle.layer.lstmemory( - input=mix_hidden, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - ``` - -4. 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,经过一个全连接层映射到标记字典的维度,得到最终的特征向量表示。 - - ```python - feature_out = paddle.layer.mixed( - size=label_dict_len, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - ``` - -5. 网络的末端定义CRF层计算损失(cost),指定参数名字为 `crfw`,该层需要输入正确的数据标签(target)。 - - ```python - crf_cost = paddle.layer.crf( - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - ``` - -6. 
CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。 - - ```python - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - ``` - -## 训练模型 - -### 定义参数 - -首先依据模型配置的`crf_cost`定义模型参数。 - -```python -# create parameters -parameters = paddle.parameters.create([crf_cost, crf_dec]) -``` - -可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 - -```python -print parameters.keys() -``` - -如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 - -```python -# 这里加载PaddlePaddle上版保存的二进制模型 -def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) - return np.fromfile(f, dtype=np.float32).reshape(h, w) -parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) -``` - -### 构造训练(Trainer) - -然后根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 - -```python -# create optimizer -optimizer = paddle.optimizer.Momentum( - momentum=0, - learning_rate=2e-2, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage( - average_window=0.5, max_average_window=10000), ) - -trainer = paddle.trainer.SGD(cost=crf_cost, - parameters=parameters, - update_equation=optimizer) -``` - -### 训练 - -数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。`conll05.test()`每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -```python -reader = paddle.reader.batched( - paddle.reader.shuffle( - conll05.test(), buf_size=8192), batch_size=20) -``` - -通过`reader_dict`来指定每一个数据和data_layer的对应关系。 例如 下面`reader_dict`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。 - - -```python -reader_dict = { - 'word_data': 0, - 'ctx_n2_data': 1, - 'ctx_n1_data': 2, - 'ctx_0_data': 3, - 'ctx_p1_data': 4, - 'ctx_p2_data': 5, - 'verb_data': 6, - 'mark_data': 7, - 'target': 8 -} -``` - -可以使用`event_handler`回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是`trainer.train`函数里设定。 - -```python -def 
event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) -``` - -通过`trainer.train`函数训练: - -```python -trainer.train( - reader=reader, - event_handler=event_handler, - num_passes=10000, - reader_dict=reader_dict) -``` - -## 总结 - -语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 - -## 参考文献 -1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483. -2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013. -3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. -4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014. -5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289. -6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012. -7. Marcus M P, Marcinkiewicz M A, Santorini B. 
[Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330. -8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106. -9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164. -10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/label_semantic_roles/README.en.md b/label_semantic_roles/README.en.md index 4accfe0aac6eaf7c42b92e3ffaee145d9e8e92b3..d43b8c931e6c4e68a3de115d177edfbcc6df57ee 100644 --- a/label_semantic_roles/README.en.md +++ b/label_semantic_roles/README.en.md @@ -22,34 +22,20 @@ Standard SRL system mostly builds on top of Syntactic Analysis and contains five
-
+
Fig 1. Syntactic parse tree
-核心关系-> HED -定中关系-> ATT -主谓关系-> SBV -状中结构-> ADV -介宾关系-> POB -右附加关系-> RAD -动宾关系-> VOB -标点-> WP - However, complete syntactic analysis requires identifying the relation among all constituents, and the performance of SRL is sensitive to the precision of syntactic analysis, which makes SRL a very challenging task. To reduce the complexity and obtain some syntactic structure information, we often use shallow syntactic analysis. Shallow Syntactic Analysis is also called partial parsing or chunking. Unlike complete syntactic analysis which requires the construction of the complete parsing tree, Shallow Syntactic Analysis only needs to identify some independent components with relatively simple structure, such as verb phrases (chunk). To avoid difficulties in constructing a syntactic tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking based SRL methods, which convert SRL into a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using BIO representation. For syntactic chunks forming a chunk of type A, the first chunk receives the B-A tag (Begin), the remaining ones receive the tag I-A (Inside), and all chunks outside receive the tag O-A. The BIO representation of the above example is shown in Fig.2.
-
+
Fig 2. BIO representation
-输入序列-> input sequence -语块-> chunk -标注序列-> label sequence -角色-> role - This example illustrates the simplicity of sequence tagging because (1) shallow syntactic analysis reduces the precision requirement of syntactic analysis; (2) pruning candidate arguments is removed; (3) argument identification and tagging are finished at the same time. Such unified methods simplify the procedure, reduce the risk of accumulating errors and boost the performance further. In this tutorial, our SRL system is built as an end-to-end system via a neural network. We take only text sequences, without using any syntactic parsing results or complex hand-designed features. We use the public dataset [CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/) as an example to illustrate the task: given a sentence with predicates marked, identify the corresponding arguments and their semantic roles by a sequence tagging method. @@ -70,14 +56,11 @@ The operation of a single LSTM cell contain 3 parts: (1) input-to-hidden: map in Fig.3 illustrates the final stacked recurrent neural networks. -

-
+

+
Fig 3. Stacked Recurrent Neural Networks

-线性变换-> linear transformation -输入层到隐层-> input-to-hidden - ### Bidirectional Recurrent Neural Network LSTMs can summarize the history of previous inputs seen up to now, but can not see the future. In most of NLP (natural language processing) tasks, the entire sentences are ready to use. Therefore, sequential learning might be much efficient if the future can be encoded as well like histories. @@ -85,16 +68,11 @@ LSTMs can summarize the history of previous inputs seen up to now, but can not s To address the above drawbacks, we can design bidirectional recurrent neural networks by making a minor modification. Higher LSTM layers process the sequence in reversed direction with previous lower LSTM layers, i.e., Deep LSTMs operate from left-to-right, right-to-left, left-to-right,..., in depth. Therefore, LSTM layers at time-step $t$ can see both histories and the future since the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks. -

-
+

+
Fig 4. Bidirectional LSTMs

-线性变换-> linear transformation -输入层到隐层-> input-to-hidden -正向处理输出序列->process sequence in the forward direction -反向处理上一层序列-> process sequence from the previous layer in backward direction - Note that, this bidirectional RNNs is different with the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNNs in the following tasks[machine translation](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.md) ### Conditional Random Field @@ -106,12 +84,12 @@ CRF is a probabilistic graph model (undirected) with nodes denoting random varia Sequence tagging tasks only consider input and output as linear sequences without extra dependent assumptions on graph model. Thus, the graph model of sequence tagging tasks is simple chain or line, which results in a Linear-Chain Conditional Random Field, shown in Fig.5. -

+


Fig 5. Linear Chain Conditional Random Field used in SRL tasks

-By the fundamental theorem of random fields \[[5](#Reference)\], the joint distribution over the label sequence $Y$ given $X$ has the form: +By the fundamental theorem of random fields \[[5](#Reference)\], the joint distribution over the label sequence $Y$ given $X$ has the form: $$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ @@ -155,19 +133,11 @@ After modification, the model is as follows: 4. Take representation from step 3 as input of CRF, label sequence as supervision signal, do sequence tagging tasks -
-
+
+
Fig 6. DB-LSTM for SRL tasks
-论元-> argu -谓词-> pred -谓词上下文-> ctx-p -谓词上下文区域标记-> $m_r$ -输入-> input -原句-> sentence -反向LSTM-> LSTM Reverse - ## Data Preparation In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. It is important to note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, if you want to train a usable neural network SRL system, consider paying for the full corpus. @@ -225,6 +195,8 @@ We trained in the English Wikipedia language model to get a word vector lookup t Get dictionary, print dictionary size: ```python +import math +import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 @@ -233,164 +205,164 @@ word_dict_len = len(word_dict) label_dict_len = len(label_dict) pred_len = len(verb_dict) -print len(word_dict_len) -print len(label_dict_len) -print len(pred_len) +print word_dict_len +print label_dict_len +print pred_len ``` ## Model configuration -1. Define input data dimensions and model hyperparameters. - - ```python - mark_dict_len = 2 # Value range of region mark. Region mark is either 0 or 1, so range is 2 - word_dim = 32 # word vector dimension - mark_dim = 5 # adjacent dimension - hidden_dim = 512 # the dimension of LSTM hidden layer vector is 128 (512/4) - depth = 8 # depth of stacked LSTM - - # There are 9 features per sample, so we will define 9 data layers. - # They type for each layer is integer_value_sequence. 
- def d_type(value_range): - return paddle.data_type.integer_value_sequence(value_range) - - # word sequence - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - # predicate - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - # 5 features for predicate context - ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - - # region marker sequence - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - # label sequence - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - ``` - - Speciala note: hidden_dim = 512 means LSTM hidden vector of 128 dimension (512/4). Please refer PaddlePaddle official documentation for detail: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)。 - -2. The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences. - - ```python - - # Since word vectorlookup table is pre-trained, we won't update it this time. - # is_static being True prevents updating the lookup table during training. - emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) - # hyperparameter configurations - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - std_default = paddle.attr.Param(initial_std=default_std) - std_0 = paddle.attr.Param(initial_std=0.) 
- - predicate_embedding = paddle.layer.embedding( - size=word_dim, - input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embedding( - size=mark_dim, input=mark, param_attr=std_0) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - ``` - -3. 8 LSTM units will be trained in "forward / backward" order. - - ```python - hidden_0 = paddle.layer.mixed( +- 1. Define input data dimensions and model hyperparameters. + +```python +mark_dict_len = 2 # Value range of region mark. Region mark is either 0 or 1, so range is 2 +word_dim = 32 # word vector dimension +mark_dim = 5 # adjacent dimension +hidden_dim = 512 # the dimension of LSTM hidden layer vector is 128 (512/4) +depth = 8 # depth of stacked LSTM + +# There are 9 features per sample, so we will define 9 data layers. +# They type for each layer is integer_value_sequence. 
+def d_type(value_range): + return paddle.data_type.integer_value_sequence(value_range) + +# word sequence +word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) +# predicate +predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + +# 5 features for predicate context +ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) +ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) +ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) +ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) +ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + +# region marker sequence +mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + +# label sequence +target = paddle.layer.data(name='target', type=d_type(label_dict_len)) +``` + +Speciala note: hidden_dim = 512 means LSTM hidden vector of 128 dimension (512/4). Please refer PaddlePaddle official documentation for detail: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)。 + +- 2. The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences. + +```python + +# Since word vectorlookup table is pre-trained, we won't update it this time. +# is_static being True prevents updating the lookup table during training. +emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) +# hyperparameter configurations +default_std = 1 / math.sqrt(hidden_dim) / 3.0 +std_default = paddle.attr.Param(initial_std=default_std) +std_0 = paddle.attr.Param(initial_std=0.) 
+ +predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) +mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + +word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] +emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input +] +emb_layers.append(predicate_embedding) +emb_layers.append(mark_embedding) +``` + +- 3. 8 LSTM units will be trained in "forward / backward" order. + +```python +hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + +mix_hidden_lr = 1e-3 +lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) +hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + +lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + +# stack L-LSTM and R-LSTM with direct edges +input_tmp = [hidden_0, lstm_0] + +for i in range(1, depth): + mix_hidden = paddle.layer.mixed( size=hidden_dim, bias_attr=std_default, input=[ paddle.layer.full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) ]) - mix_hidden_lr = 1e-3 - lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) - hidden_para_attr = paddle.attr.Param( - initial_std=default_std, learning_rate=mix_hidden_lr) - - lstm_0 = paddle.layer.lstmemory( - input=hidden_0, + lstm = paddle.layer.lstmemory( + input=mix_hidden, act=paddle.activation.Relu(), gate_act=paddle.activation.Sigmoid(), 
state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), bias_attr=std_0, param_attr=lstm_para_attr) - # stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = paddle.layer.lstmemory( - input=mix_hidden, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - ``` - -4. We will concatenate the output of top LSTM unit with it's input, and project into a hidden layer. Then put a fully connected layer on top of it to get the final vector representation. - - ```python - feature_out = paddle.layer.mixed( + input_tmp = [mix_hidden, lstm] +``` + +- 4. We will concatenate the output of top LSTM unit with it's input, and project into a hidden layer. Then put a fully connected layer on top of it to get the final vector representation. + + ```python + feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + ``` + +- 5. We use CRF as cost function, the parameter of CRF cost will be named `crfw`. + +```python +crf_cost = paddle.layer.crf( size=label_dict_len, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - ``` - -5. 
We use CRF as cost function, the parameter of CRF cost will be named `crfw`. - - ```python - crf_cost = paddle.layer.crf( - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - ``` - -6. CRF decoding layer is used for evaluation and inference. It shares parameter with CRF layer. The sharing of parameters among multiple layers is specified by the same parameter name in these layers. - - ```python - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - ``` + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) +``` + +- 6. CRF decoding layer is used for evaluation and inference. It shares parameter with CRF layer. The sharing of parameters among multiple layers is specified by the same parameter name in these layers. + +```python +crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) +``` ## Train model @@ -403,7 +375,7 @@ parameters = paddle.parameters.create([crf_cost, crf_dec]) ``` We can print out parameter name. It will be generated if not specified. - + ```python print parameters.keys() ``` @@ -413,8 +385,8 @@ Now we load pre-trained word lookup table. ```python def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: - f.read(16) - return np.fromfile(f, dtype=np.float32).reshape(h, w) + f.read(16) + return np.fromfile(f, dtype=np.float32).reshape(h, w) parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) ``` @@ -440,15 +412,15 @@ trainer = paddle.trainer.SGD(cost=crf_cost, As mentioned in data preparation section, we will use CoNLL 2005 test corpus as training data set. 
`conll05.test()` outputs one training instance at a time. It will be shuffled, and batched into mini batches as input. ```python -reader = paddle.reader.batched( +reader = paddle.batch( paddle.reader.shuffle( conll05.test(), buf_size=8192), batch_size=20) ``` -`reader_dict` is used to specify relationship between data instance and layer layer. For example, according to following `reader_dict`, the 0th column of data instance produced by`conll05.test()` correspond to data layer named `word_data`. +`feeding` is used to specify relationship between data instance and layer layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` correspond to data layer named `word_data`. ```python -reader_dict = { +feeding = { 'word_data': 0, 'ctx_n2_data': 1, 'ctx_n1_data': 2, @@ -478,7 +450,7 @@ trainer.train( reader=reader, event_handler=event_handler, num_passes=10000, - reader_dict=reader_dict) + feeding=feeding) ``` ## Conclusion diff --git a/label_semantic_roles/README.md b/label_semantic_roles/README.md index a6b4333130a4b8813fe89a19b517bf43dcd8624e..9fcf1ae84c3f83d68f0e57be320928ff83000e67 100644 --- a/label_semantic_roles/README.md +++ b/label_semantic_roles/README.md @@ -52,7 +52,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb 图3是最终得到的栈式循环神经网络结构示意图。 -

+


图3. 基于LSTM的栈式循环神经网络结构示意图

@@ -63,7 +63,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb 为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 -

+


图4. 基于LSTM的双向循环神经网络结构示意图

@@ -78,7 +78,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型 序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 -

+


图5. 序列标注任务中使用的线性链条件随机场

@@ -122,15 +122,15 @@ $$L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \fra 3. 第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特性表示序列; 4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; -
+

图6. SRL任务上的深层双向LSTM模型
-## 数据准备 -### 数据介绍与下载 -在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。运行 `sh ./get_data.sh` 会自动从官方网站上下载原始数据。需要特别说明的是,CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 +## 数据介绍 + +在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是,CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中,我们使用test.wsj文件夹中的数据进行训练和测试,并只会用到words文件夹(文本序列)和props文件夹(标注结果)下的数据。本教程使用的数据目录如下: @@ -143,28 +143,25 @@ conll05st-release/ 标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同,但原理是相同的,关于标注结果标签含义的说明,请参考论文\[[9](#参考文献)\]。 -除数据之外,`get_data.sh`同时下载了以下资源: - -| 文件名称 | 说明 | -|---|---| -| word_dict | 输入句子的词典,共计44068个词 | -| label_dict | 标记的词典,共计106个标记 | -| predicate_dict | 谓词的词典,共计3162个词 | -| emb | 一个训练好的词表,32维 | - -我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 - -### 数据预处理 -脚本在下载数据之后,又调用了`extract_pair.py`和`extract_dict_feature.py`两个子脚本进行数据预处理,前者完成了下面的第1步,后者完成了下面的2~4步: +原始数据需要进行数据预处理才能被PaddlePaddle处理,预处理包括下面几个步骤: 1. 将文本序列和标记序列其合并到一条记录中; 2. 一个句子如果含有$n$个谓词,这个句子会被处理$n$次,变成$n$条独立的训练样本,每个样本一个不同的谓词; 3. 抽取谓词上下文和构造谓词上下文区域标记; 4. 构造以BIO法表示的标记; +5. 依据词典获取词对应的整数索引。 + -`data/feature`文件是处理好的模型输入,一行是一条训练样本,以"\t"分隔,共9列,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 +```python +# import paddle.v2.dataset.conll05 as conll05 +# conll05.corpus_reader函数完成上面第1步和第2步. +# conll05.reader_creator函数完成上面第3步到第5步. +# conll05.test函数可以获取处理之后的每条样本来供PaddlePaddle训练. 
+``` -| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | +预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 + +| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | |---|---|---|---|---| | A | set | n't been set . × | 0 | B-A1 | | record | set | n't been set . × | 0 | I-A1 | @@ -175,288 +172,282 @@ conll05st-release/ | set | set | n't been set . × | 1 | B-V | | . | set | n't been set . × | 1 | O | -### 提供数据给 PaddlePaddle -1. 使用hook函数进行PaddlePaddle输入字段的格式定义。 - - ```python - def hook(settings, word_dict, label_dict, predicate_dict, **kwargs): - settings.word_dict = word_dict # 获取句子序列的字典 - settings.label_dict = label_dict # 获取标记序列的字典 - settings.predicate_dict = predicate_dict # 获取谓词的字典 - - # 所有输入特征都是使用one-hot表示序列,在PaddlePaddle中是interger_value_sequence类型 - # input_types是一个字典,字典中每个元素对应着配置中的一个data_layer,key恰好就是data_layer的名字 - - settings.input_types = { - 'word_data': integer_value_sequence(len(word_dict)), # 句子序列 - 'ctx_n2_data': integer_value_sequence(len(word_dict)), # 谓词上下文中的第1个词 - 'ctx_n1_data': integer_value_sequence(len(word_dict)), # 谓词上下文中的第2个词 - 'ctx_0_data': integer_value_sequence(len(word_dict)), # 谓词上下文中的第3个词 - 'ctx_p1_data': integer_value_sequence(len(word_dict)), # 谓词上下文中的第4个词 - 'ctx_p2_data': integer_value_sequence(len(word_dict)), # 谓词上下文中的第5个词 - 'verb_data': integer_value_sequence(len(predicate_dict)), # 谓词 - 'mark_data': integer_value_sequence(2), # 谓词上下文区域标记 - 'target': integer_value_sequence(len(label_dict)) # 标记序列 - } - ``` - -2. 
使用process将数据逐一提供给PaddlePaddle,只需要考虑如何从原始数据文件中返回一条训练样本。 - - ```python - def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - # 句子文本 - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - # 一个谓词,这里将谓词扩展成一个和句子一样长的序列 - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - - # 在教程中,我们使用一个窗口为 5 的谓词上下文窗口:谓词和这个谓词前后隔两个词 - # 这里会将窗口中的每一个词,扩展成和输入句子一样长的序列 - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - # 谓词上下文区域标记,是一个二值特征 - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield { - 'word_data': word_slot, - 'ctx_n2_data': ctx_n2_slot, - 'ctx_n1_data': ctx_n1_slot, - 'ctx_0_data': ctx_0_slot, - 'ctx_p1_data': ctx_p1_slot, - 'ctx_p2_data': ctx_p2_slot, - 'verb_data': predicate_slot, - 'mark_data': mark_slot, - 'target': label_slot - } - ``` -## 模型配置说明 +除数据之外,我们同时提供了以下资源: -### 数据定义 +| 文件名称 | 说明 | +|---|---| +| word_dict | 输入句子的词典,共计44068个词 | +| label_dict | 标记的词典,共计106个标记 | +| predicate_dict | 谓词的词典,共计3162个词 | +| emb | 一个训练好的词表,32维 | -首先通过 define_py_data_sources2 从dataprovider中读入数据。配置文件中会读取三个字典:输入文本序列的字典、标记的字典、谓词的字典,并传给data provider,data provider会利用这三个字典,将相应的文本输入转换成one-hot序列。 +我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 + +获取词典,打印词典大小: ```python 
-define_py_data_sources2( - train_list=train_list_file, - test_list=test_list_file, - module='dataprovider', - obj='process', - args={ - 'word_dict': word_dict, # 输入文本序列的字典 - 'label_dict': label_dict, # 标记的字典 - 'predicate_dict': predicate_dict # 谓词的词典 - } -) +import math +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 + +paddle.init(use_gpu=False, trainer_count=1) + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) + +print word_dict_len +print label_dict_len +print pred_len ``` -### 算法配置 -在这里,我们指定了模型的训练参数,选择了$L_2$正则、学习率和batch size,并使用带Momentum的随机梯度下降法作为优化算法。 +## 模型配置说明 + +- 1. 定义输入数据维度及模型超参数。 ```python -settings( - batch_size=150, - learning_method=MomentumOptimizer(momentum=0), - learning_rate=2e-2, - regularization=L2Regularization(8e-4), - model_average=ModelAverage(average_window=0.5, max_average_window=10000) -) +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 + +# 一条样本总共9个特征,下面定义了9个data层,每个层类型为integer_value_sequence,表示整数ID的序列类型. 
+def d_type(size): + return paddle.data_type.integer_value_sequence(size) + +# 句子序列 +word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) +# 谓词 +predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + +# 谓词上下文5个特征 +ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) +ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) +ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) +ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) +ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + +# 谓词上下区域标志 +mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + +# 标注序列 +target = paddle.layer.data(name='target', type=d_type(label_dict_len)) ``` -### 模型结构 - -1. 定义输入数据维度及模型超参数。 - - ```python - mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 - word_dim = 32 # 词向量维度 - mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 - hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 - depth = 8 # 栈式LSTM的深度 - - word = data_layer(name='word_data', size=word_dict_len) - predicate = data_layer(name='verb_data', size=pred_len) - - ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len) - ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len) - ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) - ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) - ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len) - mark = data_layer(name='mark_data', size=mark_dict_len) - - if not is_predict: - target = data_layer(name='target', size=label_dict_len) # 标记序列只在训练和测试流程中定义 - ``` 这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 -2. 
将句子序列、谓词、谓词上下文、谓词上下文区域标记通过词表,转换为实向量表示的词向量序列。 - - ```python - - # 在本教程中,我们加载了预训练的词向量,这里设置了:is_static=True - # is_static 为 True 时保证了在训练 SRL 模型过程中,词表不再更新 - emb_para = ParameterAttribute(name='emb', initial_std=0., is_static=True) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - embedding_layer( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - mark_embedding = embedding_layer( - name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0) - emb_layers.append(mark_embedding) - ``` - -3. 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 - - ```python - # std_0 指定的参数以均值为0的高斯分布初始化,用在LSTM的bias初始化中 - std_0 = ParameterAttribute(initial_std=0.) - - hidden_0 = mixed_layer( - name='hidden0', - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - lstm_0 = lstmemory( - name='lstm0', - input=hidden_0, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - bias_attr=std_0, - param_attr=lstm_para_attr) - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = mixed_layer( - name='hidden' + str(i), - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - lstm = lstmemory( - name='lstm' + str(i), - input=mix_hidden, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - ``` - -4. 
取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,经过一个全连接层映射到标记字典的维度,得到最终的特征向量表示。 - - ```python - feature_out = mixed_layer( - name='output', - size=label_dict_len, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - ``` - -5. CRF层在网络的末端,完成序列标注。 - - ```python - crf_l = crf_layer( - name='crf', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - ``` +- 2. 将句子序列、谓词、谓词上下文、谓词上下文区域标记通过词表,转换为实向量表示的词向量序列。 + +```python + +# 在本教程中,我们加载了预训练的词向量,这里设置了:is_static=True +# is_static 为 True 时保证了在训练 SRL 模型过程中,词表不再更新 +emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) +# 设置超参数 +default_std = 1 / math.sqrt(hidden_dim) / 3.0 +std_default = paddle.attr.Param(initial_std=default_std) +std_0 = paddle.attr.Param(initial_std=0.) + +predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) +mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + +word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] +emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input +] +emb_layers.append(predicate_embedding) +emb_layers.append(mark_embedding) +``` + +- 3. 
8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 + +```python +hidden_0 = paddle.layer.mixed( +size=hidden_dim, +bias_attr=std_default, +input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers +]) + +mix_hidden_lr = 1e-3 +lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) +hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + +lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + +#stack L-LSTM and R-LSTM with direct edges +input_tmp = [hidden_0, lstm_0] + +for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] +``` + +- 4. 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,经过一个全连接层映射到标记字典的维度,得到最终的特征向量表示。 + +```python +feature_out = paddle.layer.mixed( +size=label_dict_len, +bias_attr=std_default, +input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) +], ) +``` + +- 5. 网络的末端定义CRF层计算损失(cost),指定参数名字为 `crfw`,该层需要输入正确的数据标签(target)。 + +```python +crf_cost = paddle.layer.crf( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) +``` + +- 6. 
CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。 + +```python +crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) +``` ## 训练模型 -执行`sh train.sh`进行模型的训练,其中指定了总共需要训练150个pass。 - -```bash -paddle train \ - --config=./db_lstm.py \ - --save_dir=./output \ - --trainer_count=1 \ - --dot_period=500 \ - --log_period=10 \ - --num_passes=200 \ - --use_gpu=false \ - --show_parameter_stats_period=10 \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' + +### 定义参数 + +首先依据模型配置的`crf_cost`定义模型参数。 + +```python +# create parameters +parameters = paddle.parameters.create([crf_cost, crf_dec]) ``` -训练日志示例如下。 +可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 -```text -I1224 18:11:53.661479 1433 TrainerInternal.cpp:165] Batch=880 samples=145305 AvgCost=2.11541 CurrentCost=1.8645 Eval: __sum_evaluator_0__=0.607942 CurrentEval: __sum_evaluator_0__=0.59322 -I1224 18:11:55.254021 1433 TrainerInternal.cpp:165] Batch=885 samples=146134 AvgCost=2.11408 CurrentCost=1.88156 Eval: __sum_evaluator_0__=0.607299 CurrentEval: __sum_evaluator_0__=0.494572 -I1224 18:11:56.867604 1433 TrainerInternal.cpp:165] Batch=890 samples=146987 AvgCost=2.11277 CurrentCost=1.88839 Eval: __sum_evaluator_0__=0.607203 CurrentEval: __sum_evaluator_0__=0.590856 -I1224 18:11:58.424069 1433 TrainerInternal.cpp:165] Batch=895 samples=147793 AvgCost=2.11129 CurrentCost=1.84247 Eval: __sum_evaluator_0__=0.607099 CurrentEval: __sum_evaluator_0__=0.588089 -I1224 18:12:00.006893 1433 TrainerInternal.cpp:165] Batch=900 samples=148611 AvgCost=2.11148 CurrentCost=2.14526 Eval: __sum_evaluator_0__=0.607882 CurrentEval: __sum_evaluator_0__=0.749389 -I1224 18:12:00.164089 1433 TrainerInternal.cpp:181] Pass=0 Batch=901 samples=148647 AvgCost=2.11195 Eval: __sum_evaluator_0__=0.60793 +```python +print parameters.keys() ``` -经过150个 pass 后,得到平均 error 约为 0.0516055。 -## 应用模型 
+如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 -训练好的$N$个pass,会得到$N$个模型,我们需要从中选择一个最优模型进行预测。通常做法是在开发集上进行调参,并基于我们关心的某个性能指标选择最优模型。本教程的`predict.sh`脚本简单地选择了测试集上标记错误最少的那个pass(这里是pass-00100)用于预测。 +```python +# 这里加载PaddlePaddle上版保存的二进制模型 +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) + return np.fromfile(f, dtype=np.float32).reshape(h, w) +parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) +``` -预测时,我们需要将配置中的 `crf_layer` 删掉,替换为 `crf_decoding_layer`,如下所示: +### 构造训练(Trainer) + +然后根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 ```python -crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - param_attr=ParameterAttribute(name='crfw')) +# create optimizer +optimizer = paddle.optimizer.Momentum( + momentum=0, + learning_rate=2e-2, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000), ) + +trainer = paddle.trainer.SGD(cost=crf_cost, + parameters=parameters, + update_equation=optimizer) ``` -运行`python predict.py`脚本,便可使用指定的模型进行预测。 - -```bash -python predict.py - -c db_lstm.py # 指定配置文件 - -w output/pass-00100 # 指定预测使用的模型所在的路径 - -l data/targetDict.txt # 指定标记的字典 - -p data/verbDict.txt # 指定谓词的词典 - -d data/wordDict.txt # 指定输入文本序列的字典 - -i data/feature # 指定输入数据的路径 - -o predict.res # 指定标记结果输出到文件的路径 +### 训练 + +数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。`conll05.test()`每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 + +```python +reader = paddle.batch( + paddle.reader.shuffle( + conll05.test(), buf_size=8192), batch_size=20) ``` -预测结束后,在 - o 参数所指定的标记结果文件中,我们会得到如下格式的输出:每行是一条样本,以 “\t” 分隔的 2 列,第一列是输入文本,第二列是标记的结果。通过BIO标记可以直接得到论元的语义角色标签。 +通过`feeding`来指定每一个数据和data_layer的对应关系。 例如 下面`feeding`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。 -```text -The interest-only securities were priced at 35 1\/2 to yield 10.72 % . 
B-A0 I-A0 I-A0 O O O O O O B-V B-A1 I-A1 O + +```python +feeding = { + 'word_data': 0, + 'ctx_n2_data': 1, + 'ctx_n1_data': 2, + 'ctx_0_data': 3, + 'ctx_p1_data': 4, + 'ctx_p2_data': 5, + 'verb_data': 6, + 'mark_data': 7, + 'target': 8 +} +``` + +可以使用`event_handler`回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是`trainer.train`函数里设定。 + +```python +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) +``` + +通过`trainer.train`函数训练: + +```python +trainer.train( + reader=reader, + event_handler=event_handler, + num_passes=10000, + feeding=feeding) ``` ## 总结 diff --git a/label_semantic_roles/data/extract_pairs.py b/label_semantic_roles/data/extract_pairs.py index 94a8488c16734eb1882d54f7ec36f4b9308c09d4..d0290c44cf089990d2cf30137834132cb72ab5ea 100644 --- a/label_semantic_roles/data/extract_pairs.py +++ b/label_semantic_roles/data/extract_pairs.py @@ -20,7 +20,7 @@ from optparse import OptionParser def read_labels(props_file): ''' a sentence maybe has more than one verb, each verb has its label sequence - label[], is a 3-dimension list. + label[], is a 3-dimension list. 
the first dim is to store all sentence's label seqs, len is the sentence number the second dim is to store all label sequences for one sentences the third dim is to store each label for one word diff --git a/label_semantic_roles/image/bd_lstm_en.png b/label_semantic_roles/image/bd_lstm_en.png new file mode 100755 index 0000000000000000000000000000000000000000..c3646312e48db977402fb353dc0c9b4d02269bf4 Binary files /dev/null and b/label_semantic_roles/image/bd_lstm_en.png differ diff --git a/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/label_semantic_roles/image/bidirectional_stacked_lstm_en.png new file mode 100755 index 0000000000000000000000000000000000000000..f0a195c24d9ee493f96bb93c28a99e70566be7a4 Binary files /dev/null and b/label_semantic_roles/image/bidirectional_stacked_lstm_en.png differ diff --git a/label_semantic_roles/image/bio_example.png b/label_semantic_roles/image/bio_example.png old mode 100644 new mode 100755 index 9ffebf26e6b5f879849e24061bfcc1a3b36d2f9d..e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20 Binary files a/label_semantic_roles/image/bio_example.png and b/label_semantic_roles/image/bio_example.png differ diff --git a/label_semantic_roles/image/bio_example_en.png b/label_semantic_roles/image/bio_example_en.png new file mode 100755 index 0000000000000000000000000000000000000000..93b44dd4874402ef29ad7bd7d94147609b92e309 Binary files /dev/null and b/label_semantic_roles/image/bio_example_en.png differ diff --git a/label_semantic_roles/image/dependency_parsing.png b/label_semantic_roles/image/dependency_parsing.png old mode 100644 new mode 100755 index e54df49321d0607b0c3ae3300d38176a21f50d57..9265b671735940ed6549e2980064d2ce08baae64 Binary files a/label_semantic_roles/image/dependency_parsing.png and b/label_semantic_roles/image/dependency_parsing.png differ diff --git a/label_semantic_roles/image/dependency_parsing_en.png b/label_semantic_roles/image/dependency_parsing_en.png new file mode 100755 index 
0000000000000000000000000000000000000000..23f4f45b603e3d60702af2b2464d10fc8deed061 Binary files /dev/null and b/label_semantic_roles/image/dependency_parsing_en.png differ diff --git a/label_semantic_roles/image/stacked_lstm_en.png b/label_semantic_roles/image/stacked_lstm_en.png new file mode 100755 index 0000000000000000000000000000000000000000..0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61 Binary files /dev/null and b/label_semantic_roles/image/stacked_lstm_en.png differ diff --git a/label_semantic_roles/index.en.html b/label_semantic_roles/index.en.html index 790fdb3dd97c25cbccd44dff3dd7848858c41bdc..6302f9fd6c76708777139d1b4d494fe2959f7df8 100644 --- a/label_semantic_roles/index.en.html +++ b/label_semantic_roles/index.en.html @@ -41,24 +41,24 @@