diff --git a/.gitignore b/.gitignore
index 0a0dd02414c32ede8d58d2556709827f9a98bf5c..b7dfec34eb31252fcb42b3bab9f70e3a0676ba22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
+deprecated
+*~
pandoc.template
.DS_Store
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d8234b69cb092a25eb884a754600168f9a67f75..47d837aecba9767f09ef9a9a7da0c8049b4b7878 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,16 +1,34 @@
-- repo: https://github.com/Lucas-C/pre-commit-hooks.git
- sha: c25201a00e6b0514370501050cf2a8538ac12270
- hooks:
- - id: remove-crlf
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
- id: yapf
files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ # Bazel BUILD files follow Python syntax.
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
+ sha: v0.7.1
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
+ files: \.md$
+ - id: trailing-whitespace
+ files: \.md$
+- repo: git://github.com/Lucas-C/pre-commit-hooks
+ sha: v1.0.1
+ hooks:
+ - id: forbid-crlf
+ files: \.md$
+ - id: remove-crlf
+ files: \.md$
+ - id: forbid-tabs
+ files: \.md$
+ - id: remove-tabs
+ files: \.md$
+- repo: local
+ hooks:
+ - id: convert-markdown-into-html
+ name: convert-markdown-into-html
+ description: "Convert README.md into index.html and README.en.md into index.en.html"
+ entry: python pre-commit-hooks/convert_markdown_into_html.py
+ language: system
+ files: \.md$
diff --git a/.tmpl/marked.js b/.tmpl/marked.js
index 3c4fbe885422d11cdfdea4dfcdb71c3f42ef2022..0499d1d4e383ee3f866b9f9eed91ae775fe3da10 100644
--- a/.tmpl/marked.js
+++ b/.tmpl/marked.js
@@ -1093,7 +1093,7 @@ function escape(html, encode) {
}
function unescape(html) {
- // explicitly match decimal, hex, and named HTML entities
+ // explicitly match decimal, hex, and named HTML entities
return html.replace(/&(#(?:\d+)|(?:#x[0-9A-Fa-f]+)|(?:\w+));?/g, function(_, n) {
n = n.toLowerCase();
if (n === 'colon') return ':';
diff --git a/build.sh b/build.sh
deleted file mode 100755
index 8497a3db15496faba30b245db9189c1916d1f374..0000000000000000000000000000000000000000
--- a/build.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-for i in $(du -a | grep '\.\/.\+\/README.md' | cut -f 2); do
- .tmpl/convert-markdown-into-html.sh $i > $(dirname $i)/index.html
-done
-
-for i in $(du -a | grep '\.\/.\+\/README.en.md' | cut -f 2); do
- .tmpl/convert-markdown-into-html.sh $i > $(dirname $i)/index.en.html
-done
diff --git a/fit_a_line/README.en.md b/fit_a_line/README.en.md
index a804ca9192d4df295bce81d9b95f1c69e9478439..29aabdab045feeea895b5c3c1e3bd2840f170310 100644
--- a/fit_a_line/README.en.md
+++ b/fit_a_line/README.en.md
@@ -1,59 +1,73 @@
# Linear Regression
-Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict house prices. Some important concepts in Machine Learning will be covered through this example.
+Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict home prices. Some important concepts in Machine Learning will be covered through this example.
-The source code for this tutorial is at [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line). If this is your first time using PaddlePaddle, please refer to the [Install Guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html).
+The source code for this tutorial lives in [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line). For instructions on getting started with PaddlePaddle, see the [PaddlePaddle installation guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html).
-## Problem
-Suppose we have a dataset of $n$ houses. Each house $i$ has $d$ properties and the price $y_i$. A property $x_{i,d}$ describes one aspect of the house, for example, the number of rooms in the house, the number of schools or hospitals in the neighborhood, the nearby traffic condition, etc. Our task is to predict $y_i$ given a set of properties $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price is a linear combination of all the properties, i.e.,
+## Problem Setup
+Suppose we have a dataset of $n$ real estate properties. These real estate properties will be referred to as *homes* in this chapter for clarity.
+
+Each home is associated with $d$ attributes. The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic conditions nearby.
+
+In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely,
$$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b, i=1,\ldots,n$$
-where $\omega_{d}$ and $b$ are the model parameters we want to estimate. Once they are learned, given a set of properties of a house, we will be able to predict a price for that house. The model we have here is called Linear Regression, namely, we want to regress a value as a linear combination of several values. In practice this linear model for our problem is hardly true, because the real relationship between the house properties and the price is much more complicated. However, due to its simple formulation which makes the model training and analysis easy, Linear Regression has been applied to lots of real problems. It is always an important topic in many classical Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\].
+where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\].
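+
+To make the formula concrete, here is a minimal NumPy sketch (the attribute values and parameters below are made up purely for illustration) that computes a predicted price as the linear combination of a home's attributes plus the bias:
+
+```python
+import numpy as np
+
+x = np.random.rand(13)    # hypothetical attribute vector x_{i,1}, ..., x_{i,13} of one home
+w = np.random.rand(13)    # hypothetical weights omega_1, ..., omega_13
+b = 0.5                   # hypothetical bias
+
+y_hat = np.dot(w, x) + b  # predicted price: linear combination of the attributes plus the bias
+print(y_hat)
+```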
## Results Demonstration
-We first show the training result of our model. We use the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) to train a linear model and predict the house prices in Boston. The figure below shows the predictions the model makes for some house prices. The $X$ coordinate of each point represents the median value of the prices of a certain type of houses, while the $Y$ coordinate represents the predicted value by our linear model. When $X=Y$, the point lies exactly on the dotted line. In other words, the more precise the model predicts, the closer the point is to the dotted line.
+We first show the result of our model. The [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the more precisely the model predicts, the closer the point is to the dotted line.
-
- Figure 1. Predicted Value V.S. Actual Value
+
+ Figure 1. Predicted Value V.S. Actual Value
## Model Overview
### Model Definition
-In the UCI Housing Data Set, there are 13 house properties $x_{i,d}$ that are related to the median house price $y_i$. Thus our model is:
+In the UCI Housing Data Set, there are 13 home attributes $\{x_{i,j}\}$ that are related to the median home price $y_i$, which we aim to predict. Thus, our model can be written as:
$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
-where $\hat{Y}$ is the predicted value used to differentiate from the actual value $Y$. The model parameters to be learned are: $\omega_1, \ldots, \omega_{13}, b$, where $\omega$ are called the weights and $b$ is called the bias.
+where $\hat{Y}$ is the predicted value, written this way to distinguish it from the actual value $Y$. The model learns the parameters $\omega_1, \ldots, \omega_{13}, b$, where the entries of $\vec{\omega}$ are the **weights** and $b$ is the **bias**.
-Now we need an optimization goal, so that with the learned parameters, $\hat{Y}$ is close to $Y$ as much as possible. Here we introduce the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). The Loss Function has such property: given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$, its output is always non-negative. This non-negative value reflects the model error.
+Now we need an objective to optimize, so that the learned parameters can make $\hat{Y}$ as close to $Y$ as possible. Here we introduce the concept of a [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). A loss function must output a non-negative value, given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$. This value reflects the magnitude of the model error.
-For Linear Regression, the most common Loss Function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form:
+For Linear Regression, the most common loss function is the [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error), which has the following form:
$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
-For a dataset of size $n$, MSE is the average value of the $n$ predicted errors.
+That is, for a dataset of size $n$, MSE is the average of the squared prediction errors.
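+
+For illustration, the following small sketch (with made-up actual and predicted prices) evaluates the MSE exactly as defined above:
+
+```python
+import numpy as np
+
+y_actual = np.array([24.0, 21.6, 34.7])     # hypothetical actual prices
+y_predicted = np.array([25.1, 20.2, 33.0])  # hypothetical model predictions
+
+mse = np.mean((y_predicted - y_actual) ** 2)  # average of the squared prediction errors
+print(mse)
+```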
### Training
-After defining our model, we have several major steps for the training:
-1. Initialize the parameters including the weights $\omega$ and the bias $b$. For example, we can set their mean values as 0s, and their standard deviations as 1s.
-2. Feedforward to compute the network output and the Loss Function.
-3. Backward to [backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors.
+After setting up our model, there are several major steps to go through to train it:
+1. Initialize the parameters, including the weights $\vec{\omega}$ and the bias $b$. For example, we can initialize them with mean $0$ and standard deviation $1$.
+2. Feedforward. Evaluate the network output and compute the corresponding loss.
+3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors.
4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of repeats is reached.
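+
+The four steps above can be sketched as a simple gradient-descent loop in plain NumPy. This is only an illustration on synthetic data, not the PaddlePaddle implementation used later in this chapter:
+
+```python
+import numpy as np
+
+np.random.seed(0)
+X = np.random.rand(100, 13)            # synthetic attribute matrix
+y = X.dot(np.arange(1, 14)) + 3.0      # synthetic prices generated by a known linear rule
+
+w, b = np.zeros(13), 0.0               # step 1: initialize parameters
+learning_rate = 0.1
+for step in range(1000):
+    y_hat = X.dot(w) + b               # step 2: feedforward to compute predictions
+    err = y_hat - y
+    loss = np.mean(err ** 2)           # MSE loss
+    w -= learning_rate * 2 * X.T.dot(err) / len(y)   # step 3: update parameters with gradients
+    b -= learning_rate * 2 * err.mean()
+    if loss < 1e-6:                    # step 4: stop when the loss is below a threshold
+        break
+```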
-## Data Preparation
-Follow the command below to prepare data:
-```bash
-cd data && python prepare_data.py
+## Dataset
+
+### Python Dataset Modules
+
+Our program begins by importing the necessary packages:
+
+```python
+import paddle.v2 as paddle
+import paddle.v2.dataset.uci_housing as uci_housing
```
-This line of code will download the dataset from the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) and perform some [preprocessing](#Preprocessing). The dataset is split into a training set and a test set.
-The dataset contains 506 lines in total, each line describing the properties and the median price of a certain type of houses in Boston. The meaning of each line is below:
+We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`. This module can
+1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if it has not been downloaded already, and
+2. [preprocess](#preprocessing) the dataset.
-| Property Name | Explanation | Data Type |
+### An Introduction to the Dataset
+
+The UCI housing dataset has 506 instances. Each instance describes the attributes of a house in suburban Boston. The attributes are explained below:
+
+| Attribute Name | Explanation | Data Type |
| ------| ------ | ------ |
| CRIM | per capita crime rate by town | Continuous|
| ZN | proportion of residential land zoned for lots over 25,000 sq.ft. | Continuous |
@@ -70,113 +84,115 @@ The dataset contains 506 lines in total, each line describing the properties and
| LSTAT | % lower status of the population | Continuous |
| MEDV | Median value of owner-occupied homes in $1000's | Continuous |
-The last entry is the median house price.
+The last entry is the median home price.
### Preprocessing
#### Continuous and Discrete Data
-We define a feature vector of length 13 for each house, where each entry of the feature vector corresponds to a property of that house. Our first observation is that among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension. Note that although a discrete value is also written as digits such as 0, 1, or 2, it has a quite different meaning from a continuous value. The reason is that the difference between two discrete values has no practical meaning. For example, if we use 0, 1, and 2 to represent `red`, `green`, and `blue` respectively, although the numerical difference between `red` and `green` is smaller than that between `red` and `blue`, we cannot say that the extent to which `blue` is different from `red` is greater than the extent to which `green` is different from `red`. Therefore, when handling a discrete feature that has $d$ possible values, we will usually convert it to $d$ new features where each feature can only take 0 or 1, indicating whether the original $d$th value is present or not. Or we can map the discrete feature to a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing.
+We define a feature vector of length 13 for each home, where each entry corresponds to an attribute. Our first observation is that, among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension.
+
+Note that although a discrete value is also written as a numeric value such as 0, 1, or 2, its meaning differs drastically from that of a continuous value. The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent the colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that the extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing.
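+
+As a small illustration of this conversion (using a hypothetical color feature, not one from the UCI dataset):
+
+```python
+# A hypothetical discrete feature with d=3 possible values: 0=Red, 1=Green, 2=Blue.
+colors = [0, 2, 1, 0]
+
+d = 3
+# Convert each value into d binary features (one-hot encoding).
+one_hot = [[1 if value == k else 0 for k in range(d)] for value in colors]
+# one_hot == [[1, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0]]
+```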
#### Feature Normalization
-Another observation we have is that there is a huge difference among the value ranges of the 13 features (Figure 2). For example, feature B has a value range of [0.32, 396.90] while feature NOX has a range of [0.3850, 0.8170]. For an effective optimization, here we need data normalization. The goal of data normalization is to scale each feature into roughly the same value range, for example [-0.5, 0.5]. In this example, we adopt a standard way of normalization: substracting the mean value from the feature and divide the result by the original value range.
+We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* fall in $[0.3850, 0.8170]$. Effective optimization requires data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, for example $[-0.5, 0.5]$. Here, we adopt a popular normalization technique: subtract the mean value from the feature value and divide the result by the width of the original value range.
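+
+The normalization just described might look like the following sketch, assuming `features` is a NumPy array with one row per home and one column per attribute (random values are used here as stand-ins for the real data):
+
+```python
+import numpy as np
+
+features = np.random.rand(506, 13) * 100   # stand-in for the raw attribute values
+
+# Subtract the per-feature mean and divide by the width of each feature's value range.
+feature_range = features.max(axis=0) - features.min(axis=0)
+normalized = (features - features.mean(axis=0)) / feature_range
+```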
There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling):
- A value range that is too large or too small might cause floating number overflow or underflow during computation.
-- Different value ranges might result in different importances of different features to the model (at least in the beginning of the training process), which however is an unreasonable assumption. Such assumption makes the optimization more difficult and increases the training time a lot.
-- Many Machine Learning techniques or models (e.g., L1/L2 regularization and Vector Space Model) are based on the assumption that all the features have roughly zero means and their value ranges are similar.
+- Different value ranges might result in different features having different *importance* to the model (at least at the beginning of the training process). This implicit assumption about the data is often unreasonable; it makes the optimization more difficult and considerably increases the training time.
+- Many machine learning techniques or models (e.g., *L1/L2 regularization* and *Vector Space Model*) assume that all the features have roughly zero means and similar value ranges.
-
- Figure 2. The value ranges of the features
+
+ Figure 2. The value ranges of the features
#### Prepare Training and Test Sets
-We split the dataset into two subsets, one for estimating the model parameters, namely, model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal of training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$. You can try different split ratios to observe how the two variances change.
+We split the dataset into two subsets: one for adjusting the model parameters, namely, for model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
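+
+A plain sketch of such a split (assuming `data` holds the 506 instances, with a random shuffle before splitting) could look like this:
+
+```python
+import numpy as np
+
+data = np.random.rand(506, 14)   # stand-in for the 13 attributes plus the price
+np.random.shuffle(data)
+
+split = int(len(data) * 0.8)     # 8:2 split ratio
+train_set, test_set = data[:split], data[split:]
+```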
+
+
+When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models, compare them on the validation set to pick the best one, and finally evaluate the selected model on the test set. Because our model is relatively simple, we will omit this validation process.
+
+
+## Training
+
+`fit_a_line/trainer.py` demonstrates the overall training process using [PaddlePaddle](http://paddlepaddle.org).
+
+### Initialize PaddlePaddle
-Executing the following command to split the dataset and write the training and test set into the `train.list` and `test.list` files, so that later PaddlePaddle can read from them.
```python
-python prepare_data.py -r 0.8 #8:2 is the default split ratio
+paddle.init(use_gpu=False, trainer_count=1)
```
-When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process begins. These hyperparameters are not part of the model parameters and cannot be trained using the same Loss Function (e.g., the number of layers in the network). Thus we will try several sets of hyperparameters to get several models, and compare these trained models on the validation set to pick the best one, and finally it on the test set. Because our model is relatively simple in this problem, we ignore this validation process for now.
+### Model Configuration
-### Provide Data to PaddlePaddle
-After the data is prepared, we use a Python Data Provider to provide data for PaddlePaddle. A Data Provider is a Python function which will be called by PaddlePaddle during training. In this example, the Data Provider only needs to read the data and return it to the training process of PaddlePaddle line by line.
+Linear regression is essentially a fully-connected layer with linear activation:
```python
-from paddle.trainer.PyDataProvider2 import *
-import numpy as np
-#define data type and dimensionality
-@provider(input_types=[dense_vector(13), dense_vector(1)])
-def process(settings, input_file):
- data = np.load(input_file.strip())
- for row in data:
- yield row[:-1].tolist(), row[-1:].tolist()
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+y_predict = paddle.layer.fc(input=x,
+ size=1,
+ act=paddle.activation.Linear())
+y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+cost = paddle.layer.regression_cost(input=y_predict, label=y)
+```
+### Create Parameters
+```python
+parameters = paddle.parameters.create(cost)
```
-## Model Configuration
+### Create Trainer
-### Data Definition
-We first call the function `define_py_data_sources2` to let PaddlePaddle read training and test data from the `dataprovider.py` in the above. PaddlePaddle can accept configuration info from the command line, for example, here we pass a variable named `is_predict` to control the model to have different structures during training and test.
```python
-from paddle.trainer_config_helpers import *
+optimizer = paddle.optimizer.Momentum(momentum=0)
-is_predict = get_config_arg('is_predict', bool, False)
+trainer = paddle.trainer.SGD(cost=cost,
+ parameters=parameters,
+ update_equation=optimizer)
+```
-define_py_data_sources2(
- train_list='data/train.list',
- test_list='data/test.list',
- module='dataprovider',
- obj='process')
+### Feeding Data
-```
+PaddlePaddle provides the
+[reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
+for loading training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
-### Algorithm Settings
-Next we need to set the details of the optimization algorithm. Due to the simplicity of the Linear Regression model, we only need to set the `batch_size` which defines how many samples are used every time for updating the parameters.
```python
-settings(batch_size=2)
+feeding={'x': 0, 'y': 1}
```
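+
+For intuition, a reader is simply a callable that yields samples whose columns line up with the feeding dictionary above. A minimal hand-written reader with the same shape as `uci_housing.train()` might look like this (synthetic data, purely for illustration):
+
+```python
+import numpy as np
+
+def synthetic_reader():
+    # Each yielded sample is a tuple: column 0 feeds the 'x' layer, column 1 feeds the 'y' layer.
+    for _ in range(100):
+        x = np.random.rand(13).astype('float32')   # 13 stand-in attribute values
+        y = np.random.rand(1).astype('float32')    # stand-in price
+        yield x, y
+
+# It could then be batched the same way as uci_housing.train(), e.g.:
+# paddle.batch(paddle.reader.shuffle(synthetic_reader, buf_size=500), batch_size=2)
+```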
-### Network
-Finally, we use `fc_layer` and `LinearActivation` to represent the Linear Regression model.
+Moreover, an event handler is provided to print the training progress:
+
```python
-#input data of 13 dimensional house information
-x = data_layer(name='x', size=13)
-
-y_predict = fc_layer(
- input=x,
- param_attr=ParamAttr(name='w'),
- size=1,
- act=LinearActivation(),
- bias_attr=ParamAttr(name='b'))
-
-if not is_predict: #when training, we use MSE (i.e., regression_cost) as the Loss Function
- y = data_layer(name='y', size=1)
- cost = regression_cost(input=y_predict, label=y)
- outputs(cost) #output MSE to view the loss change
-else: #during test, output the prediction value
- outputs(y_predict)
+# event_handler to print training and testing info
+def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 100 == 0:
+ print "Pass %d, Batch %d, Cost %f" % (
+ event.pass_id, event.batch_id, event.cost)
+
+ if isinstance(event, paddle.event.EndPass):
+ result = trainer.test(
+ reader=paddle.batch(
+ uci_housing.test(), batch_size=2),
+ feeding=feeding)
+ print "Test %d, Cost %f" % (event.pass_id, result.cost)
```
-## Training Model
-We can run the PaddlePaddle command line trainer in the root directory of the code. Here we name the configuration file as `trainer_config.py`. We train 30 passes and save the result in the directory `output`:
-```bash
-./train.sh
-```
+### Start Training
-## Use Model
-Now we can use the trained model to do prediction.
-```bash
-python predict.py
-```
-Here by default we use the model in `output/pass-00029` for prediction, and compare the actual house price with the predicted one. The result is shown in `predictions.png`.
-If you want to use another model or test on other data, you can pass in a new model path or data path:
-```bash
-python predict.py -m output/pass-00020 -t data/housing.test.npy
+```python
+trainer.train(
+ reader=paddle.batch(
+ paddle.reader.shuffle(
+ uci_housing.train(), buf_size=500),
+ batch_size=2),
+ feeding=feeding,
+ event_handler=event_handler,
+ num_passes=30)
```
## Summary
-In this chapter, we have introduced the Linear Regression model using the UCI Housing Data Set as an example. We have shown how to train and test this model with PaddlePaddle. Many more complex models and techniques are derived from this simple linear model, thus it is important for us to understand how it works.
+This chapter introduces *Linear Regression* and how to train and test this model with PaddlePaddle, using the UCI Housing Data Set. Because a large number of more complex models and techniques are derived from linear regression, it is important to understand its underlying theory and limitations.
## References
@@ -186,4 +202,4 @@ In this chapter, we have introduced the Linear Regression model using the UCI Ho
4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
- 本教程 由 PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。
+ This tutorial was created by PaddlePaddle and is licensed under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/).
diff --git a/fit_a_line/README.md b/fit_a_line/README.md
index f2e3243a3d1b91df5b8c9bfaa5da74fd142a63b3..266c6e91cb4d5249997203cada7ef920d3744386 100644
--- a/fit_a_line/README.md
+++ b/fit_a_line/README.md
@@ -39,16 +39,16 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
### 训练过程
-定义好模型结构之后,我们要通过以下几个步骤进行模型训练
- 1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。
- 2. 网络正向传播计算网络输出和损失函数。
- 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
- 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
-
+定义好模型结构之后,我们要通过以下几个步骤进行模型训练
+ 1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。
+ 2. 网络正向传播计算网络输出和损失函数。
+ 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
+ 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
+
## 数据集
### 数据集接口的封装
-首先加载需要的包
+首先加载需要的包
```python
import paddle.v2 as paddle
@@ -59,9 +59,8 @@ import paddle.v2.dataset.uci_housing as uci_housing
其中,在uci_housing模块中封装了:
-1. 数据下载的过程
- 下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data
-2. [数据预处理](#数据预处理)的过程
+1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。
+2. [数据预处理](#数据预处理)的过程。
### 数据集介绍
@@ -105,25 +104,23 @@ import paddle.v2.dataset.uci_housing as uci_housing
我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$
-
在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。
## 训练
-fit_a_line下trainer.py演示了训练的整体过程
-### 初始化paddlepaddle
+`fit_a_line/trainer.py`演示了训练的整体过程。
+
+### 初始化PaddlePaddle
```python
-# init
paddle.init(use_gpu=False, trainer_count=1)
```
-### 模型配置
+### 模型配置
-使用`fc_layer`和`LinearActivation`来表示线性回归的模型本身。
+线性回归的模型其实就是一个采用线性激活函数(linear activation,`LinearActivation`)的全连接层(fully-connected layer,`fc_layer`):
```python
-#输入数据,13维的房屋信息
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
size=1,
@@ -131,17 +128,15 @@ y_predict = paddle.layer.fc(input=x,
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.regression_cost(input=y_predict, label=y)
```
-### 创建参数
+### 创建参数
```python
-# create parameters
parameters = paddle.parameters.create(cost)
```
-### 创建trainer
+### 创建Trainer
```python
-# create optimizer
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
@@ -149,14 +144,20 @@ trainer = paddle.trainer.SGD(cost=cost,
update_equation=optimizer)
```
-### 读取数据且打印训练的中间信息
-在程序中,我们通过reader接口来获取训练或者测试的数据,通过eventhandler来打印训练的中间信息
-feeding中设置了训练数据和测试数据的下标,reader通过下标区分训练和测试数据。
+### 读取数据且打印训练的中间信息
+
+PaddlePaddle提供一个
+[reader机制](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
+来读取数据。 Reader返回的数据可以包括多列,我们需要一个Python dict把列
+序号映射到网络里的数据层。
```python
-feeding={'x': 0,
- 'y': 1}
+feeding={'x': 0, 'y': 1}
+```
+
+此外,我们还可以提供一个 event handler,来打印训练的进度:
+```python
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
@@ -171,10 +172,10 @@ def event_handler(event):
feeding=feeding)
print "Test %d, Cost %f" % (event.pass_id, result.cost)
```
-### 开始训练
+
+### 开始训练
```python
-# training
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
@@ -185,13 +186,6 @@ trainer.train(
num_passes=30)
```
-## bash中执行训练程序
-**注意设置好paddle的安装包路径**
-
-```bash
-python train.py
-```
-
## 总结
在这章里,我们借助波士顿房价这一数据集,介绍了线性回归模型的基本概念,以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来,因此弄清楚线性模型的原理和局限非常重要。
diff --git a/fit_a_line/index.en.html b/fit_a_line/index.en.html
index b2492b2c8d0ab1126ba444acc669102bc02ebdfb..3039e0384bcda3502a04f6be03be4127506735e4 100644
--- a/fit_a_line/index.en.html
+++ b/fit_a_line/index.en.html
@@ -1,3 +1,4 @@
+
diff --git a/fit_a_line/index.html b/fit_a_line/index.html
index 7bb9e8a2f6b69b766f17eb72d5cd9d9844138b2d..7bcd415d8d7563a228303ed0679b5f3c3599a14d 100644
--- a/fit_a_line/index.html
+++ b/fit_a_line/index.html
@@ -1,3 +1,4 @@
+
diff --git a/gan/index.html b/gan/index.html
index 1f88c3593d9d0027c58c83fb78543aae95c2b5b5..4cee22efeb2f27936b87292d7db23dd1f7cb1bad 100644
--- a/gan/index.html
+++ b/gan/index.html
@@ -1,3 +1,4 @@
+
diff --git a/image_caption/index.html b/image_caption/index.html
index bd5f85aff99e291b01dc091f7a3d4ac622bce4a6..eb2c91f9e3a7aba340da33415b5a3fbb5ef6d32b 100644
--- a/image_caption/index.html
+++ b/image_caption/index.html
@@ -1,3 +1,4 @@
+
diff --git a/image_classification/README.en.md b/image_classification/README.en.md
index 26b0caeae0173ffa3b96c138a0e991759dd4d8f6..3cfc40cbe339d274cf055d62ba1b7c9d9e7955a5 100644
--- a/image_classification/README.en.md
+++ b/image_classification/README.en.md
@@ -248,48 +248,48 @@ First we define VGG network. Since the image size and amount of CIFAR10 are rela
The input to the network is defined as `data_layer`, or image pixels in the context of image classification. The images in CIFAR10 are 32x32 color images of three channels. Therefore, the size of the input data is 3072 (3x32x32), and the number of categories is 10.
- ```python
- datadim = 3 * 32 * 32
- classdim = 10
- data = data_layer(name='image', size=datadim)
- ```
+ ```python
+ datadim = 3 * 32 * 32
+ classdim = 10
+ data = data_layer(name='image', size=datadim)
+ ```
2. Define VGG main module
- ```python
- net = vgg_bn_drop(data)
- ```
+ ```python
+ net = vgg_bn_drop(data)
+ ```
The input to VGG main module is from data layer. `vgg_bn_drop` defines a 16-layer VGG network, with each convolutional layer followed by BN and dropout layers. Here is the definition in detail:
- ```python
- def vgg_bn_drop(input, num_channels):
- def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None):
- return img_conv_group(
- input=ipt,
- num_channels=num_channels_,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act=ReluActivation(),
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type=MaxPooling())
-
- conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = dropout_layer(input=conv5, dropout_rate=0.5)
- fc1 = fc_layer(input=drop, size=512, act=LinearActivation())
- bn = batch_norm_layer(
- input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5))
- fc2 = fc_layer(input=bn, size=512, act=LinearActivation())
- return fc2
-
- ```
+ ```python
+ def vgg_bn_drop(input, num_channels):
+ def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None):
+ return img_conv_group(
+ input=ipt,
+ num_channels=num_channels_,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act=ReluActivation(),
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type=MaxPooling())
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = dropout_layer(input=conv5, dropout_rate=0.5)
+ fc1 = fc_layer(input=drop, size=512, act=LinearActivation())
+ bn = batch_norm_layer(
+ input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5))
+ fc2 = fc_layer(input=bn, size=512, act=LinearActivation())
+ return fc2
+
+ ```
2.1. First defines a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.trainer_config_helpers` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
@@ -303,22 +303,22 @@ First we define VGG network. Since the image size and amount of CIFAR10 are rela
The above VGG network extracts high-level features and maps them to a vector of the same size as the categories. Softmax function or classifier is then used for calculating the probability of the image belonging to each category.
- ```python
- out = fc_layer(input=net, size=class_num, act=SoftmaxActivation())
- ```
+ ```python
+ out = fc_layer(input=net, size=class_num, act=SoftmaxActivation())
+ ```
4. Define Loss Function and Outputs
In the context of supervised learning, labels of training images are defined in `data_layer`, too. During training, cross-entropy is used as loss function and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier.
- ```python
- if not is_predict:
- lbl = data_layer(name="label", size=class_num)
- cost = classification_cost(input=out, label=lbl)
- outputs(cost)
- else:
- outputs(out)
- ```
+ ```python
+ if not is_predict:
+ lbl = data_layer(name="label", size=class_num)
+ cost = classification_cost(input=out, label=lbl)
+ outputs(cost)
+ else:
+ outputs(out)
+ ```
### ResNet
diff --git a/image_classification/README.md b/image_classification/README.md
index 538760d429d6f250eed2ce578e10001526d11abb..829e99a2b9cb9819d87df9bfc53dd81f1a2a6147 100644
--- a/image_classification/README.md
+++ b/image_classification/README.md
@@ -3,7 +3,7 @@
本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。
-## 背景介绍
+## 背景介绍
图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。
@@ -51,7 +51,7 @@
2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。
3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。
4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。
-
+
这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
@@ -67,8 +67,8 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
+
+
+
diff --git a/label_semantic_roles/README.en.md b/label_semantic_roles/README.en.md
index d172d8b63de3f6c5a9016da4524a0a339d4696c9..d43b8c931e6c4e68a3de115d177edfbcc6df57ee 100644
--- a/label_semantic_roles/README.en.md
+++ b/label_semantic_roles/README.en.md
@@ -22,34 +22,20 @@ Standard SRL system mostly builds on top of Syntactic Analysis and contains five
-
+
Fig 1. Syntactic parse tree
-核心关系-> HED
-定中关系-> ATT
-主谓关系-> SBV
-状中结构-> ADV
-介宾关系-> POB
-右附加关系-> RAD
-动宾关系-> VOB
-标点-> WP
-
However, complete syntactic analysis requires identifying the relation among all constitutes and the performance of SRL is sensitive to the precision of syntactic analysis, which makes SRL a very challenging task. To reduce the complexity and obtain some syntactic structure information, we often use shallow syntactic analysis. Shallow Syntactic Analysis is also called partial parsing or chunking. Unlike complete syntactic analysis which requires the construction of the complete parsing tree, Shallow Syntactic Analysis only need to identify some independent components with relatively simple structure, such as verb phrases (chunk). To avoid difficulties in constructing a syntactic tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking based SRL methods, which convert SRL as a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using BIO representation. For syntactic chunks forming a chunk of type A, the first chunk receives the B-A tag (Begin), the remaining ones receive the tag I-A (Inside), and all chunks outside receive the tag O-A.
The BIO representation of above example is shown in Fig.1.
-
+
Fig 2. BIO represention
-输入序列-> input sequence
-语块-> chunk
-标注序列-> label sequence
-角色-> role
-
This example illustrates the simplicity of sequence tagging because (1) shallow syntactic analysis reduces the precision requirement of syntactic analysis; (2) pruning candidate arguments is removed; 3) argument identification and tagging are finished at the same time. Such unified methods simplify the procedure, reduce the risk of accumulating errors and boost the performance further.
In this tutorial, our SRL system is built as an end-to-end system via a neural network. We take only text sequences, without using any syntactic parsing results or complex hand-designed features. We give public dataset [CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/) as an example to illustrate: given a sentence with predicates marked, identify the corresponding arguments and their semantic roles by sequence tagging method.
@@ -70,14 +56,11 @@ The operation of a single LSTM cell contain 3 parts: (1) input-to-hidden: map in
Fig.3 illustrate the final stacked recurrent neural networks.
-
-
+
+
Fig 3. Stacked Recurrent Neural Networks
-线性变换-> linear transformation
-输入层到隐层-> input-to-hidden
-
### Bidirectional Recurrent Neural Network
LSTMs can summarize the history of previous inputs seen up to now, but can not see the future. In most of NLP (natural language processing) tasks, the entire sentences are ready to use. Therefore, sequential learning might be much efficient if the future can be encoded as well like histories.
@@ -85,16 +68,11 @@ LSTMs can summarize the history of previous inputs seen up to now, but can not s
To address the above drawbacks, we can design bidirectional recurrent neural networks by making a minor modification. Higher LSTM layers process the sequence in reversed direction with previous lower LSTM layers, i.e., Deep LSTMs operate from left-to-right, right-to-left, left-to-right,..., in depth. Therefore, LSTM layers at time-step $t$ can see both histories and the future since the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks.
-
-
+
+
Fig 4. Bidirectional LSTMs
-线性变换-> linear transformation
-输入层到隐层-> input-to-hidden
-正向处理输出序列->process sequence in the forward direction
-反向处理上一层序列-> process sequence from the previous layer in backward direction
-
Note that, this bidirectional RNNs is different with the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNNs in the following tasks[machine translation](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.md)
### Conditional Random Field
@@ -106,12 +84,12 @@ CRF is a probabilistic graph model (undirected) with nodes denoting random varia
Sequence tagging tasks only consider input and output as linear sequences without extra dependent assumptions on graph model. Thus, the graph model of sequence tagging tasks is simple chain or line, which results in a Linear-Chain Conditional Random Field, shown in Fig.5.
-
+
Fig 5. Linear Chain Conditional Random Field used in SRL tasks
-By the fundamental theorem of random fields \[[5](#Reference)\], the joint distribution over the label sequence $Y$ given $X$ has the form:
+By the fundamental theorem of random fields \[[5](#Reference)\], the joint distribution over the label sequence $Y$ given $X$ has the form:
$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$
@@ -155,19 +133,11 @@ After modification, the model is as follows:
4. Take representation from step 3 as input of CRF, label sequence as supervision signal, do sequence tagging tasks
-
-
+
+
Fig 6. DB-LSTM for SRL tasks
-论元-> argu
-谓词-> pred
-谓词上下文-> ctx-p
-谓词上下文区域标记-> $m_r$
-输入-> input
-原句-> sentence
-反向LSTM-> LSTM Reverse
-
## Data Preparation
In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. It is important to note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, if you want to train a usable neural network SRL system, consider paying for the full corpus.
@@ -259,10 +229,10 @@ def d_type(value_range):
# word sequence
word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
# predicate
-predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
+predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
# 5 features for predicate context
-ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
+ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
@@ -274,12 +244,12 @@ mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
# label sequence
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
```
-
+
Speciala note: hidden_dim = 512 means LSTM hidden vector of 128 dimension (512/4). Please refer PaddlePaddle official documentation for detail: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)。
- 2. The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences.
-```python
+```python
# Since word vectorlookup table is pre-trained, we won't update it this time.
# is_static being True prevents updating the lookup table during training.
@@ -405,7 +375,7 @@ parameters = paddle.parameters.create([crf_cost, crf_dec])
```
We can print out parameter name. It will be generated if not specified.
-
+
```python
print parameters.keys()
```
diff --git a/label_semantic_roles/README.md b/label_semantic_roles/README.md
index 5c8f92d7949135a1aefd124ba1a1fc580cbe44c1..9fcf1ae84c3f83d68f0e57be320928ff83000e67 100644
--- a/label_semantic_roles/README.md
+++ b/label_semantic_roles/README.md
@@ -52,7 +52,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
图3是最终得到的栈式循环神经网络结构示意图。
-
@@ -161,7 +161,7 @@ conll05st-release/
预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。
-| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 |
+| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 |
|---|---|---|---|---|
| A | set | n't been set . × | 0 | B-A1 |
| record | set | n't been set . × | 0 | I-A1 |
@@ -214,7 +214,7 @@ word_dim = 32 # 词向量维度
mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度
hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4
depth = 8 # 栈式LSTM的深度
-
+
# 一条样本总共9个特征,下面定义了9个data层,每个层类型为integer_value_sequence,表示整数ID的序列类型.
def d_type(size):
return paddle.data_type.integer_value_sequence(size)
@@ -222,10 +222,10 @@ def d_type(size):
# 句子序列
word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
# 谓词
-predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
+predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
# 谓词上下文5个特征
-ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
+ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
@@ -237,12 +237,12 @@ mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
# 标注序列
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
```
-
+
这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。
- 2. 将句子序列、谓词、谓词上下文、谓词上下文区域标记通过词表,转换为实向量表示的词向量序列。
-```python
+```python
# 在本教程中,我们加载了预训练的词向量,这里设置了:is_static=True
# is_static 为 True 时保证了在训练 SRL 模型过程中,词表不再更新
@@ -369,7 +369,7 @@ parameters = paddle.parameters.create([crf_cost, crf_dec])
```
可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。
-
+
```python
print parameters.keys()
```
diff --git a/label_semantic_roles/data/extract_pairs.py b/label_semantic_roles/data/extract_pairs.py
index 94a8488c16734eb1882d54f7ec36f4b9308c09d4..d0290c44cf089990d2cf30137834132cb72ab5ea 100644
--- a/label_semantic_roles/data/extract_pairs.py
+++ b/label_semantic_roles/data/extract_pairs.py
@@ -20,7 +20,7 @@ from optparse import OptionParser
def read_labels(props_file):
'''
a sentence maybe has more than one verb, each verb has its label sequence
- label[], is a 3-dimension list.
+ label[], is a 3-dimension list.
the first dim is to store all sentence's label seqs, len is the sentence number
the second dim is to store all label sequences for one sentences
the third dim is to store each label for one word
diff --git a/label_semantic_roles/image/bd_lstm_en.png b/label_semantic_roles/image/bd_lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..c3646312e48db977402fb353dc0c9b4d02269bf4
Binary files /dev/null and b/label_semantic_roles/image/bd_lstm_en.png differ
diff --git a/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..f0a195c24d9ee493f96bb93c28a99e70566be7a4
Binary files /dev/null and b/label_semantic_roles/image/bidirectional_stacked_lstm_en.png differ
diff --git a/label_semantic_roles/image/bio_example.png b/label_semantic_roles/image/bio_example.png
old mode 100644
new mode 100755
index 9ffebf26e6b5f879849e24061bfcc1a3b36d2f9d..e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20
Binary files a/label_semantic_roles/image/bio_example.png and b/label_semantic_roles/image/bio_example.png differ
diff --git a/label_semantic_roles/image/bio_example_en.png b/label_semantic_roles/image/bio_example_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..93b44dd4874402ef29ad7bd7d94147609b92e309
Binary files /dev/null and b/label_semantic_roles/image/bio_example_en.png differ
diff --git a/label_semantic_roles/image/dependency_parsing.png b/label_semantic_roles/image/dependency_parsing.png
old mode 100644
new mode 100755
index e54df49321d0607b0c3ae3300d38176a21f50d57..9265b671735940ed6549e2980064d2ce08baae64
Binary files a/label_semantic_roles/image/dependency_parsing.png and b/label_semantic_roles/image/dependency_parsing.png differ
diff --git a/label_semantic_roles/image/dependency_parsing_en.png b/label_semantic_roles/image/dependency_parsing_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..23f4f45b603e3d60702af2b2464d10fc8deed061
Binary files /dev/null and b/label_semantic_roles/image/dependency_parsing_en.png differ
diff --git a/label_semantic_roles/image/stacked_lstm_en.png b/label_semantic_roles/image/stacked_lstm_en.png
new file mode 100755
index 0000000000000000000000000000000000000000..0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61
Binary files /dev/null and b/label_semantic_roles/image/stacked_lstm_en.png differ
diff --git a/label_semantic_roles/index.en.html b/label_semantic_roles/index.en.html
index a3fc66370f6ac872d38f5faf798414dd86987e7e..2bc090b5bf507ce5fe74d31e0aeb2d2f3f5daf67 100644
--- a/label_semantic_roles/index.en.html
+++ b/label_semantic_roles/index.en.html
@@ -1,3 +1,4 @@
+
diff --git a/label_semantic_roles/index.html b/label_semantic_roles/index.html
index 044b3bd72dae72d996b93201b73bd1a9a9dd959c..7f93944e57a4a76e08a83b855cffe072d3dc55cd 100644
--- a/label_semantic_roles/index.html
+++ b/label_semantic_roles/index.html
@@ -1,3 +1,4 @@
+
diff --git a/label_semantic_roles/predict.sh b/label_semantic_roles/predict.sh
index 873aad670d16803ce321ab60baabe9fe29ea64bf..1fced67cf2c79b03a1f1a2e0d690e7bd8f6d32fc 100755
--- a/label_semantic_roles/predict.sh
+++ b/label_semantic_roles/predict.sh
@@ -19,7 +19,7 @@ function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
sort -n | head -n 1
-}
+}
log=train.log
LOG=`get_best_pass $log`
@@ -28,11 +28,11 @@ best_model_path="output/pass-${LOG[1]}"
config_file=db_lstm.py
dict_file=./data/wordDict.txt
-label_file=./data/targetDict.txt
+label_file=./data/targetDict.txt
predicate_dict_file=./data/verbDict.txt
input_file=./data/feature
output_file=predict.res
-
+
python predict.py \
-c $config_file \
-w $best_model_path \
diff --git a/machine_translation/README.en.md b/machine_translation/README.en.md
index 0e85dc0f8da18e68e21e28a8445ea827c1e1b1b8..8a7f1098182b6443b58c22360da2fcdfd3439444 100644
--- a/machine_translation/README.en.md
+++ b/machine_translation/README.en.md
@@ -9,19 +9,19 @@ Machine translation (MT) leverages computers to translate from one language to a
Early machine translation systems are mainly rule-based i.e. they rely on a language expert to specify the translation rules between the two languages. It is quite difficult to cover all the rules used in one languge. So it is quite a challenge for language experts to specify all possible rules in two or more different languages. Hence, a major challenge in conventional machine translation has been the difficulty in obtaining a complete rule set \[[1](#References)\]。
-To address the aforementioned problems, statistical machine translation techniques have been developed. These techniques learn the translation rules from a large corpus, instead of being designed by a language expert. While these techniques overcome the bottleneck of knowledge acquisition, there are still quite a lot of challenges, for example:
+To address the aforementioned problems, statistical machine translation techniques have been developed. These techniques learn the translation rules from a large corpus, instead of being designed by a language expert. While these techniques overcome the bottleneck of knowledge acquisition, there are still quite a lot of challenges, for example:
-1. human designed features cannot cover all possible linguistic variations;
+1. human designed features cannot cover all possible linguistic variations;
-2. it is difficult to use global features;
+2. it is difficult to use global features;
3. the techniques heavily rely on pre-processing techniques like word alignment, word segmentation and tokenization, rule-extraction and syntactic parsing etc. The error introduced in any of these steps could accumulate and impact translation quality.
-The recent development of deep learning provides new solutions to these challenges. The two main categories for deep learning based machine translation techniques are:
+The recent development of deep learning provides new solutions to these challenges. The two main categories for deep learning based machine translation techniques are:
-1. techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
+1. techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
2. techniques mapping from source language to target language directly using a neural network, or end-to-end neural machine translation (NMT).
@@ -57,7 +57,7 @@ This section will introduce Gated Recurrent Unit (GRU), Bi-directional Recurrent
We already introduced RNN and LSTM in the [Sentiment Analysis](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md) chapter.
Compared to a simple RNN, the LSTM added memory cell, input gate, forget gate and output gate. These gates combined with the memory cell greatly improve the ability to handle long-term dependencies.
-GRU\[[2](#References)\] proposed by Cho et al is a simplified LSTM and an extension of a simple RNN. It is shown in the figure below.
+GRU\[[2](#References)\] proposed by Cho et al. is a simplified LSTM and an extension of a simple RNN. It is shown in the figure below.
A GRU unit has only two gates:
- reset gate: when this gate is closed, the history information is discarded, i.e., the irrelevant historical information has no effect on the future output.
- update gate: it combines the input gate and the forget gate and is used to control the impact of historical information on the hidden output. The historical information is passed over when the update gate is close to 1.
@@ -96,20 +96,20 @@ There are three steps for encoding a sentence:
1. One-hot vector representation of a word: Each word $x_i$ in the source sentence $x=\left \{ x_1,x_2,...,x_T \right \}$ is represented as a vector $w_i\in R^{\left | V \right |},i=1,2,...,T$, where $w_i$ has the same dimensionality as the size of the dictionary $\left | V \right |$, has a one at the position corresponding to the position of the word in the dictionary, and zeros elsewhere.
-2. Word embedding as a representation in the low-dimensional semantic space: There are two problems with one-hot vector representation
+2. Word embedding as a representation in a low-dimensional semantic space: There are two problems with the one-hot vector representation:
- * the dimensionality of the vector is typically large, leading to the curse of dimensionality;
+ * the dimensionality of the vector is typically large, leading to the curse of dimensionality;
 * it is hard to capture the relationships between words, i.e., semantic similarities. It is therefore useful to project the one-hot vector into a low-dimensional semantic space as a dense vector of fixed dimension, i.e., $s_i=Cw_i$ for the $i$-th word, where $C\in R^{K\times \left | V \right |}$ is the projection matrix and $K$ is the dimensionality of the word embedding vector.
3. Encoding of the source sequence via RNN: This can be described mathematically as:
$$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$$
-
- where
- $h_0$ is a zero vector,
- $\varnothing _\theta$ is a non-linear activation function, and
- $\mathbf{h}=\left \{ h_1,..., h_T \right \}$
+
+ where
+ $h_0$ is a zero vector,
+ $\varnothing _\theta$ is a non-linear activation function, and
+ $\mathbf{h}=\left \{ h_1,..., h_T \right \}$
is the sequential encoding of the first $T$ words of the source sequence. The whole sentence can be represented either by the encoding vector at the last time step $T$ of $\mathbf{h}$, or by temporal pooling over $\mathbf{h}$ (a toy sketch of these steps is given below).
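To make the three steps concrete, here is a toy numpy sketch with made-up dimensions (a simple tanh RNN stands in for the non-linear function; the chapter itself uses a GRU, and this is not the PaddlePaddle implementation):

```python
import numpy as np

V, K = 1000, 32                        # dictionary size |V| and embedding size K (toy values)
C = np.random.randn(K, V) * 0.1        # projection matrix C in R^{K x |V|}
W = np.random.randn(K, K) * 0.1        # input-to-hidden weights of the recurrent step
U = np.random.randn(K, K) * 0.1        # hidden-to-hidden weights of the recurrent step

word_ids = [3, 17, 256]                # positions of the source words in the dictionary
h = np.zeros(K)                        # h_0 is a zero vector
encodings = []
for i in word_ids:
    w = np.zeros(V); w[i] = 1.0        # one-hot vector for the word
    s = C.dot(w)                       # word embedding s_i = C w_i
    h = np.tanh(W.dot(s) + U.dot(h))   # recurrent encoding step h_i = f(h_{i-1}, s_i)
    encodings.append(h)

sentence_vector = encodings[-1]        # last time step, or np.mean(encodings, axis=0) for pooling
```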
@@ -142,8 +142,8 @@ The generation process of machine translation is to translate the source sentenc
### Attention Mechanism
-There are a few problems with the fixed dimensional vector representation from the encoding stage:
- * It is very challenging to encode both the semantic and syntactic information a sentence with a fixed dimensional vector regardless of the length of the sentence.
+There are a few problems with the fixed-dimensional vector representation from the encoding stage:
+ * It is very challenging to encode both the semantic and syntactic information of a sentence with a fixed-dimensional vector, regardless of the length of the sentence.
 * Intuitively, when translating a sentence, we typically pay more attention to the parts of the source sentence that are more relevant to the current translation, and the focus changes as the translation proceeds. With a fixed-dimensional vector, all the information from the source sentence is treated equally, which is not reasonable. Therefore, Bahdanau et al. \[[4](#References)\] introduced the attention mechanism, which decodes based on different fragments of the context sequence in order to address the difficulty of feature learning for long sentences. The decoder with attention is explained in the following.
Different from the simple decoder, $z_i$ is computed as:
@@ -172,7 +172,7 @@ Figure 6. Decoder with Attention Mechanism
[Beam Search](http://en.wikipedia.org/wiki/Beam_search) is a heuristic search algorithm that explores a graph by expanding the most promising nodes in a limited set. It is typically used when the solution space is huge (e.g., in machine translation or speech recognition) and there is not enough memory to store all the possible solutions. For example, if we want to translate “`你好`” into English, even with only three words in the dictionary (the start token, the end token, and `hello`), it is still possible to generate an infinite number of sentences, where the word `hello` can appear a different number of times. Beam search can be used to find a good translation among them.
-Beam search builds a search tree using breadth first search and sorts the nodes according to a heuristic cost (sum of the log probability of the generated words) at each level of the tree. Only a fixed number of nodes according to the pre-specified beam size (or beam width) are considered. Thus, only nodes with highest scores are expanded in the next level. This reduces the space and time requirements significantly. However, a globally optimal solution is not guaranteed.
+Beam search builds a search tree using breadth-first search and sorts the nodes at each level of the tree according to a heuristic cost (the sum of the log probabilities of the generated words). Only a fixed number of nodes, given by the pre-specified beam size (or beam width), is kept at each level, so only the highest-scoring nodes are expanded at the next level. This reduces the space and time requirements significantly, but a globally optimal solution is not guaranteed.
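As a rough sketch of this idea, the following snippet performs beam search over word sequences. The scoring function `next_word_log_probs` is hypothetical (it would come from the trained decoder); this illustrates the search procedure only, not the generator used later in this chapter.

```python
import heapq

def beam_search(next_word_log_probs, start_token, end_token, beam_size=3, max_len=20):
    # Each candidate is (sum of log probabilities, word sequence).
    beam = [(0.0, [start_token])]
    finished = []
    for _ in range(max_len):
        candidates = []
        for score, seq in beam:
            if seq[-1] == end_token:
                finished.append((score, seq))  # keep completed translations
                continue
            for word, logp in next_word_log_probs(seq):
                candidates.append((score + logp, seq + [word]))
        if not candidates:
            break
        # Keep only the beam_size highest-scoring nodes at this level of the tree.
        beam = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
    return max(finished + beam, key=lambda c: c[0])
```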
When using beam search in decoding, the goal is to maximize the probability of the generated sequence. The procedure is as follows:
@@ -452,7 +452,7 @@ This tutorial will use the default SGD and Adam learning algorithm, with a learn
source_dict_dim = len(open(src_lang_dict, "r").readlines()) # size of the source language dictionary
target_dict_dim = len(open(trg_lang_dict, "r").readlines()) # size of target language dictionary
word_vector_dim = 512 # dimensionality of word vector
- encoder_size = 512 # dimensionality of the hidden state of encoder GRU
+ encoder_size = 512 # dimensionality of the hidden state of encoder GRU
 decoder_size = 512 # dimensionality of the hidden state of decoder GRU
if is_generating:
diff --git a/machine_translation/README.md b/machine_translation/README.md
index 01cf3dc51e04a7c692dbe1935319258b806c65bd..3eec2b68c9bc3cc14e3544b1aac6c71c4265b2ff 100644
--- a/machine_translation/README.md
+++ b/machine_translation/README.md
@@ -93,7 +93,7 @@ GRU\[[2](#参考文献)\]是Cho等人在LSTM上提出的简化版本,也是RNN
During training of the machine translation task, the objective of the decoding stage is to maximize the probability of the next correct target-language word. The idea is:
1. At each time step, compute the next hidden state $z_{i+1}$ from the encoding of the source sentence (also called the context vector) $c$, the $i$-th word $u_i$ of the true target-language sequence, and the hidden state $z_i$ of the RNN at time $i$. The formula is:
-
+
$$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$
where $\phi _{\theta '}$ is a non-linear activation function; $c=q\mathbf{h}$ is the context vector of the source sentence (when the [attention mechanism](#注意力机制) is not used and the output of the [encoder](#编码器) is the last element of the encoded source sentence, we can simply define $c=h_T$); $u_i$ is the $i$-th word of the target-language sequence, and $u_0$ is the start token of the target-language sequence, marking the beginning of decoding; $z_i$ is the hidden state of the decoding RNN at time $i$, and $z_0$ is an all-zero vector.
@@ -150,17 +150,19 @@ e_{ij}&=align(z_i,h_j)\\\\
Note: the formulas for computing $z_{i+1}$ and $p_{i+1}$ are the same as in the [decoder](#解码器). Since each generation step is implemented greedily, a globally optimal solution is not guaranteed.
-## 数据准备
+## Data Introduction
### Download and Extraction
This tutorial uses [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) from the [WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/) dataset as the training set, and [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) as the test and generation sets.
On Linux, simply run the following commands:
+
```bash
cd data
./wmt14_data.sh
```
+
The resulting dataset `data/wmt14` contains the following three folders:
@@ -227,7 +227,7 @@ trainer = paddle.trainer.SGD(cost=cost,
Then we specify the training data `paddle.dataset.wmt14.train()` and testing data `paddle.dataset.wmt14.test()`. These two functions are *reader creators*; once called, they return a *reader*. A reader is a Python function that, once called, returns a Python generator, which yields instances of data.
-Here `shuffle` is a reader decorator, which takes a reader A as its parameter, and returns a new reader B, where B calls A to read in `buffer_size` data instances everytime into a buffer, then shuffles and yield instances in the buffer. If you want very shuffled data, try use a larger buffer size.
+Here `shuffle` is a reader decorator. It takes a reader A as its parameter and returns a new reader B, where B calls A to read `buffer_size` data instances at a time into a buffer, then shuffles the buffer and yields instances from it. If you want the data to be well shuffled, use a larger buffer size.
`batch` is a special decorator whose input is a reader and whose output is a *batch reader*, which yields a minibatch at a time rather than a single instance.
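To make the decorator idea concrete, here is a minimal, framework-free sketch of what `shuffle`- and `batch`-style reader decorators do. This is an illustration only, not PaddlePaddle's actual implementation.

```python
import random

def shuffle(reader, buffer_size):
    # Reader decorator: buffer up to buffer_size instances from reader A,
    # shuffle the buffer, then yield its instances (this is reader B).
    def shuffled_reader():
        buf = []
        for instance in reader():
            buf.append(instance)
            if len(buf) >= buffer_size:
                random.shuffle(buf)
                for item in buf:
                    yield item
                buf = []
        random.shuffle(buf)
        for item in buf:
            yield item
    return shuffled_reader

def batch(reader, batch_size):
    # Decorator: turn a reader into a batch reader that yields minibatches.
    def batch_reader():
        minibatch = []
        for instance in reader():
            minibatch.append(instance)
            if len(minibatch) == batch_size:
                yield minibatch
                minibatch = []
        if minibatch:
            yield minibatch
    return batch_reader

# Usage with a toy reader that yields the integers 0..99:
train_reader = batch(shuffle(lambda: iter(range(100)), buffer_size=32), batch_size=8)
```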
diff --git a/recognize_digits/README.md b/recognize_digits/README.md
index 5970ff46835c0e3904a8491753df343e9e34a708..235ab91bddedc12623a7f642c1d526c426fe03d4 100644
--- a/recognize_digits/README.md
+++ b/recognize_digits/README.md
@@ -56,7 +56,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
1. After the first hidden layer we get $ H_1 = \phi(W_1X + b_1) $, where $\phi$ is an activation function, commonly sigmoid, tanh, or ReLU.
2. After the second hidden layer we get $ H_2 = \phi(W_2H_1 + b_2) $.
3. Finally, after the output layer we get $Y=softmax(W_3H_2 + b_3)$, which is the final classification result vector.
-
+
Figure 3 shows the network structure of the multilayer perceptron, where weights are drawn as blue lines, biases as red lines, and +1 indicates that the coefficient of the bias parameter is 1. A small numerical sketch of this forward pass is given below.
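As a small numerical sketch of the forward pass above (toy sizes, random parameters, tanh as the activation; not the PaddlePaddle configuration used later in this chapter):

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

X = np.random.randn(784)                       # a flattened 28x28 input image
W1, b1 = np.random.randn(128, 784) * 0.01, np.zeros(128)
W2, b2 = np.random.randn(64, 128) * 0.01, np.zeros(64)
W3, b3 = np.random.randn(10, 64) * 0.01, np.zeros(10)

H1 = np.tanh(W1.dot(X) + b1)                   # first hidden layer H1 = phi(W1 X + b1)
H2 = np.tanh(W2.dot(H1) + b2)                  # second hidden layer H2 = phi(W2 H1 + b2)
Y = softmax(W3.dot(H2) + b3)                   # class probability vector Y = softmax(W3 H2 + b3)
```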
diff --git a/recognize_digits/index.en.html b/recognize_digits/index.en.html
index bec542ca357adc52da20bcc6a9eba26a2c7d580f..4f6a1a920ed5f54dfd8d79c3999e121f52a2b436 100644
--- a/recognize_digits/index.en.html
+++ b/recognize_digits/index.en.html
@@ -1,3 +1,4 @@
+
-Figure 1. CNN for text modeling.
+Figure 1. CNN for text modeling.
-Assuming the length of the sentence is $n$, where the $i$-th word has embedding as $x_i\in\mathbb{R}^k$,where $k$ is the embedding dimensionality.
+Assuming the length of the sentence is $n$, the $i$-th word has embedding $x_i\in\mathbb{R}^k$, where $k$ is the embedding dimensionality.
First, we concatenate the words: every $h$ consecutive words form a window of length $h$, denoted $x_{i:i+h-1}$. It refers to $x_{i},x_{i+1},\ldots,x_{i+h-1}$, where $i$ is the index of the first word in the window, ranging from $1$ to $n-h+1$, and $x_{i:i+h-1}\in\mathbb{R}^{hk}$.
@@ -60,7 +60,7 @@ RNN is an effective model for sequential data. Theoretical, the computational a
-Figure 2. An illustration of an unrolled RNN across “time”.
+Figure 2. An illustration of an unrolled RNN across “time”.
As shown in Figure 2, we unroll an RNN: at the $t$-th time step, the network takes the $t$-th input vector and the latent state of the previous time step $h_{t-1}$ as inputs, and computes the latent state of the current step. The process is repeated until all inputs are consumed. If we regard the RNN as a function $f$, it can be formulated as:
@@ -140,7 +140,7 @@ If it runs successfully, `./data/pre-imdb` will contain:
dict.txt labels.list test.list test_part_000 train.list train_part_000
```
-* test\_part\_000 和 train\_part\_000: all labeled training and testing set, and the training set is shuffled.
+* test\_part\_000 and train\_part\_000: the labeled testing and training sets; the training set is shuffled.
* train.list and test.list: training and testing file lists (lists of file names).
* dict.txt: dictionary generated from the training set.
* labels.list: class labels; 0 stands for negative and 1 for positive.
@@ -239,7 +239,7 @@ gradient_clipping_threshold=25)
### Model Structure
We use PaddlePaddle to implement two classification algorithms, based on the above-mentioned models [Text-CNN](#Text-CNN(CNN)) and [Stacked-bidirectional LSTM](#Stacked-bidirectional LSTM(Stacked Bidirectional LSTM)).
-#### Implementation of Text CNN
+#### Implementation of Text CNN
```python
def convolution_net(input_dim,
class_dim=2,
@@ -477,7 +477,7 @@ predicting label is pos
For `10007_10.txt` in folder `./data/aclImdb/test/pos`, the predicted label is also pos, so the prediction is correct.
## Summary
-In this chapter, we use sentiment analysis as an example to introduce applying deep learning models on end-to-end short text classification, as well as how to use PaddlePaddle to implement the model. Meanwhile, we briefly introduce two models for text processing: CNN and RNN. In following chapters we will see how these models can be applied in other tasks.
+In this chapter, we use sentiment analysis as an example to introduce the application of deep learning models to end-to-end short text classification, as well as how to implement the model with PaddlePaddle. We also briefly introduce two models for text processing: CNN and RNN. In the following chapters, we will see how these models can be applied to other tasks.
## Reference
1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014.
2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
diff --git a/understand_sentiment/README.md b/understand_sentiment/README.md
index ccce07fe7a758ac89cd7e0e69e8450e2cfbdfcb1..bedad89683d12c491f34cbd1ee71a0ad41f5e5f7 100644
--- a/understand_sentiment/README.md
+++ b/understand_sentiment/README.md
@@ -57,7 +57,7 @@ $$\hat c=max(c)$$
$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$
where $W_{xh}$ is the input-to-hidden weight matrix, $W_{hh}$ is the hidden-to-hidden weight matrix, $b_h$ is the bias vector of the hidden layer, and $\sigma$ is the $sigmoid$ function.
-
+
When processing natural language, a word (in one-hot representation) is usually first mapped to its word embedding, which then serves as the input $x_t$ of the recurrent neural network at each time step. Other layers can be connected on top of the RNN hidden layer as needed: for example, the hidden output of one RNN can be fed into the input of another RNN to build a deep (or stacked) RNN, or the hidden state at the last time step can be extracted as the sentence representation and fed to a classifier, and so on.
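As a minimal sketch of the recurrence above (toy numpy code with random parameters; not the configuration used later in this chapter), a simple RNN over a sequence of word embeddings can be written as:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

D, H = 32, 64                                   # embedding size and hidden size (toy values)
W_xh = np.random.randn(H, D) * 0.1              # input-to-hidden weights
W_hh = np.random.randn(H, H) * 0.1              # hidden-to-hidden weights
b_h = np.zeros(H)                               # hidden bias

embeddings = np.random.randn(10, D)             # word embeddings x_1 ... x_10 of a sentence
h = np.zeros(H)
states = []
for x_t in embeddings:
    h = sigmoid(W_xh.dot(x_t) + W_hh.dot(h) + b_h)  # h_t = sigma(W_xh x_t + W_hh h_{t-1} + b_h)
    states.append(h)

sentence_repr = states[-1]                      # last hidden state as the sentence representation
# For a stacked RNN, `states` would become the input sequence of the next RNN layer.
```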
### Long Short-Term Memory (LSTM)
diff --git a/understand_sentiment/data/get_imdb.sh b/understand_sentiment/data/get_imdb.sh
index 7600af6fbb900ee845702f1297779c1f0ed9bf84..4542fc8313d2665cb5056177813225add54e1395 100755
--- a/understand_sentiment/data/get_imdb.sh
+++ b/understand_sentiment/data/get_imdb.sh
@@ -33,7 +33,7 @@ echo "Unzipping..."
tar -zxvf aclImdb_v1.tar.gz
unzip master.zip
-#move train and test set to imdb_data directory
+#move train and test sets to the imdb_data directory
#in order to process them during training
mkdir -p imdb/train
mkdir -p imdb/test
diff --git a/understand_sentiment/index.en.html b/understand_sentiment/index.en.html
index 43f14c10bcf4c77a71ea337940eb89e899daa916..d73b5c5b6a938f90551d25d2ac6841747fbb0393 100644
--- a/understand_sentiment/index.en.html
+++ b/understand_sentiment/index.en.html
@@ -1,3 +1,4 @@
+
diff --git a/understand_sentiment/index.html b/understand_sentiment/index.html
index aa2c6bc33d581db0a698f83bd704f7a4435fc853..252257d9c55a17a508978c327b738c40a4d69c59 100644
--- a/understand_sentiment/index.html
+++ b/understand_sentiment/index.html
@@ -1,3 +1,4 @@
+
diff --git a/understand_sentiment/preprocess.py b/understand_sentiment/preprocess.py
index 29b3682b747c66574590de5ea70574981cc536bb..cb4438ba1738456ef56a0a4656dcc0dac1b12384 100755
--- a/understand_sentiment/preprocess.py
+++ b/understand_sentiment/preprocess.py
@@ -24,7 +24,7 @@ from optparse import OptionParser
from paddle.utils.preprocess_util import *
"""
Usage: run the following command to show the help message.
- python preprocess.py -h
+ python preprocess.py -h
"""
diff --git a/word2vec/README.en.md b/word2vec/README.en.md
index 654025329250e614d72a110673b4f9054a4f68ce..bb55cf92d8202e7879fe25ac4abb6af7ddbd0772 100644
--- a/word2vec/README.en.md
+++ b/word2vec/README.en.md
@@ -36,8 +36,8 @@ The neural network based model does not require storing huge hash tables of stat
In this section, after training the word embedding model, we can use the data visualization algorithm $t$-SNE\[[4](#reference)\] to draw the word embedding vectors after projecting them onto a two-dimensional space (see the figure below). From the figure we can see that semantically related words -- *a*, *the*, and *these*, or *big* and *huge* -- are close to each other in the projected space, while unrelated words -- *say* and *business*, or *decision* and *japan* -- are far from each other.
-
- Figure 1. Two dimension projection of word embeddings
+
+ Figure 1. Two-dimensional projection of word embeddings
### Cosine Similarity
@@ -70,16 +70,16 @@ Before diving into word embedding models, we will first introduce the concept of
In general, models that generate the probability of a sequence can be applied in many fields, like machine translation, speech recognition, information retrieval, part-of-speech tagging, and handwriting recognition. Take information retrieval, for example: if you search for "how long is a football bame" (where bame is a medical noun), the search engine will ask whether you meant "how long is a football game" instead. This is because, according to the language model, the probability of "how long is a football bame" is very low, and among all of the words easily confused with "bame", "game" builds the most probable sentence.
#### Target Probability
-For language model's target probability $P(w_1, ..., w_T)$, if the words in the sentence were to be independent, the joint probability of the whole sentence would be the product of each word's probability:
+For the language model's target probability $P(w_1, ..., w_T)$, if the words in the sentence were independent, the joint probability of the whole sentence would be the product of each word's probability:
$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$
However, the occurrence of a word in a sentence typically depends on the words before it, so canonical language models construct the target probability from conditional probabilities:
-$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
+$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
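For example, for a three-word sentence the factorization reads:

$$P(w_1, w_2, w_3) = P(w_1)P(w_2 | w_1)P(w_3 | w_1, w_2)$$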
-### N-gram neural model
+### N-gram neural model
In computational linguistics, the n-gram is an important method of representing text. An n-gram is a contiguous sequence of $n$ items from a given text; depending on the application scenario, each item can be a letter, a syllable, or a word. The n-gram model is also an important method in statistical language modeling. When training language models with n-grams, the first $n-1$ words of an n-gram are used to predict the $n$-th word.
@@ -89,47 +89,47 @@ We have previously described language model using conditional probability, where
$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$
-Given some real corpus in which all sentences are meaningful, the n-gram model should maximize the following objective function:
+Given some real corpus in which all sentences are meaningful, the n-gram model should maximize the following objective function:
$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
-where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents parameter regularization term.
+where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents the parameter regularization term.
-
-
- Figure 2. N-gram neural network model
+
+
+ Figure 2. N-gram neural network model
Figure 2 shows the N-gram neural network model. From the bottom up, the model has the following components:
- For each sample, the model takes $w_{t-n+1},...,w_{t-1}$ as input and outputs the probability distribution over the `|V|` words of the dictionary for the $t$-th word.
-
- Every input word $w_{t-n+1},...w_{t-1}$ first gets transformed into word embedding $C(w_{t-n+1}),...C(w_{t-1})$ through a transformation matrix.
-
+
+ Every input word $w_{t-n+1},...w_{t-1}$ first gets transformed into word embedding $C(w_{t-n+1}),...C(w_{t-1})$ through a transformation matrix.
+
- All the word embeddings concatenate into a single vector, which is mapped (nonlinearly) into the $t$-th word hidden representation:
- $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$
-
- where $x$ is the large vector concatenated from all the word embeddings representing the context; $\theta$, $U$, $b_1$, $b_2$ and $W$ are parameters connecting word embedding layers to the hidden layers. $g$ represents the unnormalized probability of the output word, $g_i$ represents the unnormalized probability of the output word being the i-th word in the dictionary.
+ $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$
+
+ where $x$ is the large vector concatenated from all the word embeddings representing the context; $\theta$, $U$, $b_1$, $b_2$ and $W$ are parameters connecting word embedding layers to the hidden layers. $g$ represents the unnormalized probability of the output word, $g_i$ represents the unnormalized probability of the output word being the i-th word in the dictionary.
- Based on the definition of softmax, using normalized $g_i$, the probability that the output word is $w_t$ is represented as:
-
+
$$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$
-
+
- The cost of the entire network is a multi-class cross-entropy and can be described by the following loss function
- $$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$
+ $$J(\theta) = -\sum_{i=1}^N\sum_{k=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$
    where $y_k^i$ is the true label ($0$ or $1$) for the $k$-th class of the $i$-th sample, and $softmax(g_k^i)$ is the softmax probability for the $k$-th class of the $i$-th sample. A minimal numerical sketch of this forward pass is given below.
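Putting these components together, a toy numpy sketch of a single forward pass looks as follows (made-up sizes and random parameters; an illustration only, not the PaddlePaddle configuration used below):

```python
import numpy as np

V, D, H, n = 2000, 32, 256, 5                   # vocabulary, embedding, hidden sizes; a 5-gram model
C = np.random.randn(V, D) * 0.1                 # word embedding table
theta = np.random.randn((n - 1) * D, H) * 0.01  # embedding-to-hidden weights
U = np.random.randn(H, V) * 0.01                # hidden-to-output weights
W = np.random.randn((n - 1) * D, V) * 0.01      # direct embedding-to-output weights
b1, b2 = np.zeros(H), np.zeros(V)

context_ids = [12, 7, 431, 89]                  # the n-1 = 4 context words w_{t-4} ... w_{t-1}
x = np.concatenate([C[i] for i in context_ids]) # concatenated context word embeddings
g = np.tanh(x.dot(theta) + b1).dot(U) + x.dot(W) + b2   # mirrors g = U tanh(theta^T x + b1) + W x + b2
p = np.exp(g - g.max()); p /= p.sum()           # softmax over the |V| dictionary words
```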
-### Continuous Bag-of-Words model(CBOW)
+### Continuous Bag-of-Words Model (CBOW)
The CBOW model predicts the current word based on the $N$ words both before and after it. When $N=2$, the model is as shown in the figure below:
-
-
- Figure 3. CBOW model
+
+
+ Figure 3. CBOW model
Specifically, ignoring the order of words in the sequence, CBOW uses the average of the word embeddings of the context to predict the current word:
@@ -138,30 +138,30 @@ $$\text{context} = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
where $x_t$ is the word embedding of the $t$-th word, the classification score vector is $z=U*\text{context}$, the final classification $y$ uses softmax, and the loss function is the multi-class cross-entropy.
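As a small sketch of this context averaging (toy numpy code with random parameters; word order inside the window is ignored, as described above):

```python
import numpy as np

V, D = 2000, 32                                 # vocabulary and embedding sizes (toy values)
C = np.random.randn(V, D) * 0.1                 # word embedding table
U = np.random.randn(D, V) * 0.01                # classification matrix

window = [15, 3, 98, 7]                         # indices of w_{t-2}, w_{t-1}, w_{t+1}, w_{t+2}
context = C[window].mean(axis=0)                # average of the four context embeddings
z = context.dot(U)                              # classification score vector z = U * context
y = np.exp(z - z.max()); y /= y.sum()           # softmax probabilities of the current word
```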
-### Skip-gram model
+### Skip-gram model
-The advantages of CBOW is that it smooths over the word embeddings of the context and reduces noise, so it is very effective on small dataset. Skip-gram uses a word to predict its context and get multiple context for the given word, so it can be used in larger datasets.
+The advantage of CBOW is that it smooths over the word embeddings of the context and reduces noise, so it is very effective on small datasets. Skip-gram uses a word to predict its context, yielding multiple context words for the given word, so it can be used on larger datasets.
-
-
- Figure 4. Skip-gram model
+
+
+ Figure 4. Skip-gram model
-As illustrated in the figure above, skip-gram model maps the word embedding of the given word onto $2n$ word embeddings (including $n$ words before and $n$ words after the given word), and then combine the classification loss of all those $2n$ words by softmax.
+As illustrated in the figure above, the skip-gram model maps the word embedding of the given word onto $2n$ word embeddings (the $n$ words before and the $n$ words after the given word), and then combines the classification losses of all those $2n$ words via softmax.
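One standard way to write the corresponding training objective (not spelled out explicitly in this chapter) is to maximize the average log probability of the context words given the center word:

$$\frac{1}{T}\sum_{t=1}^{T}\sum_{-n\leq j\leq n,\ j\neq 0}\log P(w_{t+j} | w_t)$$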
## Data Preparation
## Model Configuration
-
-
- Figure 5. N-gram neural network model in model configuration
+
+
+ Figure 5. N-gram neural network model in model configuration
-
+
## Model Training
## Model Application
-
+
## Conclusion
This chapter introduces word embeddings, the relationship between language models and word embeddings, and how to train neural networks to learn word embeddings.
diff --git a/word2vec/README.md b/word2vec/README.md
index 1a942f4ec26cf2763977a57b2b8e9232c95f521e..414c7839d8b81834a75ae1d51db840804edaa77c 100644
--- a/word2vec/README.md
+++ b/word2vec/README.md
@@ -1,3 +1,4 @@
+
# Word Embedding
The source code of this tutorial is in [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/word2vec). For first-time users, please refer to the PaddlePaddle [installation guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html).
@@ -6,7 +7,7 @@
In this chapter we introduce the vector representation of words, also known as word embedding. Word embedding is a common operation in natural language processing and a fundamental technique behind Internet services such as search engines, advertising systems, and recommender systems.
-在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。
+In these Internet services, we often need to compare the relevance between two words or two pieces of text. To make such comparisons, we usually first represent words in a form suitable for computers to process. The most natural way is probably the vector space model.
In this representation, each word is represented as a real-valued vector (a one-hot vector) whose length is the size of the dictionary; each dimension corresponds to a word in the dictionary, and all elements are 0 except for a 1 at the dimension corresponding to the word.
Although one-hot vectors are natural, they are of limited use. For example, in an Internet advertising system, if a user's query is "Mother's Day" and an ad's keyword is "carnation", common sense tells us that the two words are related (people usually give their mothers a bunch of carnations on Mother's Day); however, the distance between the two corresponding one-hot vectors, whether measured by Euclidean distance or cosine similarity, regards the two words as completely unrelated, since the vectors are orthogonal. The root cause of this counter-intuitive conclusion is that each word by itself carries too little information, so two words alone are not enough to judge accurately whether they are related. To compute relevance precisely, we need more information: knowledge induced from large amounts of data via machine learning methods.
@@ -68,7 +69,7 @@ $$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
-### N-gram neural model
+### N-gram neural model
In computational linguistics, the n-gram is an important method of representing text: an n-gram is a contiguous sequence of n items in a text. Depending on the application scenario, each item can be a letter, a word, or a syllable. The n-gram model is also an important method in statistical language modeling; when training a language model with n-grams, the preceding n-1 words of each n-gram are generally used to predict the n-th word.
@@ -84,39 +85,39 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional probability of the current word $w_t$ given the previous n-1 words, and $R(\theta)$ represents the parameter regularization term.
-
-
+
### Data Preprocessing
This chapter trains a 5-gram model, which means that during PaddlePaddle training the first 4 words of each data instance are used to predict the 5th word. PaddlePaddle provides the Python package `paddle.dataset.imikolov` for the PTB dataset, which automatically downloads and preprocesses the data for convenience.
@@ -186,7 +187,7 @@ dream that one day
The model structure of this configuration is shown in the figure below:
-
+
Figure 5. N-gram neural network model in the model configuration
@@ -208,8 +209,8 @@ N = 5 # 训练5-Gram
Next, define the network structure:
- Map the $n-1$ words before $w_t$, i.e. $w_{t-n+1},...,w_{t-1}$, to $D$-dimensional word vectors (here $D=32$) through a $|V|\times D$ matrix.
-
-```python
+
+```python
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
input=inlayer,
@@ -225,54 +226,54 @@ def wordemb(inlayer):
- Define the data types accepted by the input layers and their names.
```python
-def main():
- paddle.init(use_gpu=False, trainer_count=1) # 初始化PaddlePaddle
- word_dict = paddle.dataset.imikolov.build_dict()
- dict_size = len(word_dict)
- # 每个输入层都接受整形数据,这些数据的范围是[0, dict_size)
- firstword = paddle.layer.data(
- name="firstw", type=paddle.data_type.integer_value(dict_size))
- secondword = paddle.layer.data(
- name="secondw", type=paddle.data_type.integer_value(dict_size))
- thirdword = paddle.layer.data(
- name="thirdw", type=paddle.data_type.integer_value(dict_size))
- fourthword = paddle.layer.data(
- name="fourthw", type=paddle.data_type.integer_value(dict_size))
- nextword = paddle.layer.data(
- name="fifthw", type=paddle.data_type.integer_value(dict_size))
-
- Efirst = wordemb(firstword)
- Esecond = wordemb(secondword)
- Ethird = wordemb(thirdword)
- Efourth = wordemb(fourthword)
+paddle.init(use_gpu=False, trainer_count=3) # initialize PaddlePaddle
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+# Each input layer accepts integer data in the range [0, dict_size)
+firstword = paddle.layer.data(
+ name="firstw", type=paddle.data_type.integer_value(dict_size))
+secondword = paddle.layer.data(
+ name="secondw", type=paddle.data_type.integer_value(dict_size))
+thirdword = paddle.layer.data(
+ name="thirdw", type=paddle.data_type.integer_value(dict_size))
+fourthword = paddle.layer.data(
+ name="fourthw", type=paddle.data_type.integer_value(dict_size))
+nextword = paddle.layer.data(
+ name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+Efirst = wordemb(firstword)
+Esecond = wordemb(secondword)
+Ethird = wordemb(thirdword)
+Efourth = wordemb(fourthword)
+
```
- Concatenate these n-1 word vectors into one large vector via concat_layer, which serves as the feature of the historical text.
```python
- contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
```
- Pass the historical text feature through a fully connected layer to obtain its hidden representation.
```python
- hidden1 = paddle.layer.fc(input=contextemb,
- size=hiddensize,
- act=paddle.activation.Sigmoid(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5),
- bias_attr=paddle.attr.Param(learning_rate=2),
- param_attr=paddle.attr.Param(
- initial_std=1. / math.sqrt(embsize * 8),
- learning_rate=1))
+hidden1 = paddle.layer.fc(input=contextemb,
+ size=hiddensize,
+ act=paddle.activation.Sigmoid(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5),
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ param_attr=paddle.attr.Param(
+ initial_std=1. / math.sqrt(embsize * 8),
+ learning_rate=1))
```
-
+
- Map the hidden text feature through another fully connected layer into a $|V|$-dimensional vector, and normalize it with softmax to obtain the generation probabilities of the `|V|` words.
```python
- predictword = paddle.layer.fc(input=hidden1,
- size=dict_size,
- bias_attr=paddle.attr.Param(learning_rate=2),
- act=paddle.activation.Softmax())
+predictword = paddle.layer.fc(input=hidden1,
+ size=dict_size,
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ act=paddle.activation.Softmax())
```
- The loss function of the network is the multi-class cross-entropy, which can be computed directly with the `classification_cost` function.
@@ -288,11 +289,11 @@ cost = paddle.layer.classification_cost(input=predictword, label=nextword)
- Regularization: a technique to prevent the network from overfitting; L2 regularization is used here.
```python
- parameters = paddle.parameters.create(cost)
- adam_optimizer = paddle.optimizer.Adam(
- learning_rate=3e-3,
- regularization=paddle.optimizer.L2Regularization(8e-4))
- trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+parameters = paddle.parameters.create(cost)
+adam_optimizer = paddle.optimizer.Adam(
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
```
Next, we start the training process. `paddle.dataset.imikolov.train()` and `paddle.dataset.imikolov.test()` provide the training and testing datasets, respectively. Each of these functions returns a reader: in PaddlePaddle, a reader is a Python function that returns a Python generator each time it is called.
@@ -300,113 +301,95 @@ cost = paddle.layer.classification_cost(input=predictword, label=nextword)
The input of `paddle.batch` is a reader and its output is a batched reader: in PaddlePaddle, a reader yields one training instance at a time, while a batched reader yields a minibatch each time.
```python
- def event_handler(event):
- if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 100 == 0:
- result = trainer.test(
+import gzip
+
+def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 100 == 0:
+ print "Pass %d, Batch %d, Cost %f, %s" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics)
+
+ if isinstance(event, paddle.event.EndPass):
+ result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), 32))
- print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
- event.pass_id, event.batch_id, event.cost, event.metrics,
- result.metrics)
-
- trainer.train(
- paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
- num_passes=30,
- event_handler=event_handler)
+ print "Pass %d, Testing metrics %s" % (event.pass_id, result.metrics)
+ with gzip.open("model_%d.tar.gz"%event.pass_id, 'w') as f:
+ parameters.to_tar(f)
+
+trainer.train(
+ paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+ num_passes=100,
+ event_handler=event_handler)
```
-训练过程是完全自动的,event_handler里打印的日志类似如下所示:
+ ...
+ Pass 0, Batch 25000, Cost 4.251861, {'classification_error_evaluator': 0.84375}
+ Pass 0, Batch 25100, Cost 4.847692, {'classification_error_evaluator': 0.8125}
+ Pass 0, Testing metrics {'classification_error_evaluator': 0.7417652606964111}
+
+
+The training process is fully automatic. The log printed by event_handler is similar to the output shown above.
-```text
-.............................
-I1222 09:27:16.477841 12590 TrainerInternal.cpp:162] Batch=3000 samples=300000 AvgCost=5.36135 CurrentCost=5.36135 Eval: classification_error_evaluator=0.818653 CurrentEval: class
-ification_error_evaluator=0.818653
-.............................
-I1222 09:27:22.416700 12590 TrainerInternal.cpp:162] Batch=6000 samples=600000 AvgCost=5.29301 CurrentCost=5.22467 Eval: classification_error_evaluator=0.814542 CurrentEval: class
-ification_error_evaluator=0.81043
-.............................
-I1222 09:27:28.343756 12590 TrainerInternal.cpp:162] Batch=9000 samples=900000 AvgCost=5.22494 CurrentCost=5.08876 Eval: classification_error_evaluator=0.810088 CurrentEval: class
-ification_error_evaluator=0.80118
-..I1222 09:27:29.128582 12590 TrainerInternal.cpp:179] Pass=0 Batch=9296 samples=929600 AvgCost=5.21786 Eval: classification_error_evaluator=0.809647
-I1222 09:27:29.627616 12590 Tester.cpp:111] Test samples=73760 cost=4.9594 Eval: classification_error_evaluator=0.79676
-I1222 09:27:29.627713 12590 GradientMachine.cpp:112] Saving parameters to model/pass-00000
-```
After 30 passes, we obtain an average error rate of classification_error_evaluator=0.735611.
## Applying the Model
-训练模型后,我们可以加载模型参数,用训练出来的词向量初始化其他模型,也可以将模型参数从二进制格式转换成文本格式进行后续应用。
+After training the model, we can load the model parameters and use the trained word vectors to initialize other models, or inspect the model parameters for later applications.
-### 初始化其他模型
-训练好的模型参数可以用来初始化其他模型。具体方法如下:
-在PaddlePaddle 训练命令行中,用`--init_model_path` 来定义初始化模型的位置,用`--load_missing_parameter_strategy`指定除了词向量以外的新模型其他参数的初始化策略。注意,新模型需要和原模型共享被初始化参数的参数名。
-
### Inspecting the Word Vectors
-PaddlePaddle训练出来的参数为二进制格式,存储在对应训练pass的文件夹下。这里我们提供了文件`format_convert.py`用来互转PaddlePaddle训练结果的二进制文件和文本格式特征文件。
-```bash
-python format_convert.py --b2t -i INPUT -o OUTPUT -d DIM
-```
-其中,INPUT是输入的(二进制)词向量模型名称,OUTPUT是输出的文本模型名称,DIM是词向量参数维度。
+The parameters trained by PaddlePaddle can be retrieved directly with `parameters.get()`. For example, the word vector of the word `word` is obtained as follows:
-用法如:
-```bash
-python format_convert.py --b2t -i model/pass-00029/_proj -o model/pass-00029/_proj.txt -d 32
-```
-转换后得到的文本文件如下:
+```python
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
-```text
-0,4,62496
--0.7444070,-0.1846171,-1.5771370,0.7070392,2.1963732,-0.0091410, ......
--0.0721337,-0.2429973,-0.0606297,0.1882059,-0.2072131,-0.7661019, ......
-......
+print embeddings[word_dict['word']]
```
-其中,第一行是PaddlePaddle 输出文件的格式说明,包含3个属性:
-1) PaddlePaddle的版本号,本例中为0;
-2) 浮点数占用的字节数,本例中为4;
-3) 总计的参数个数, 本例中为62496(即1953*32);
-第二行及之后的每一行都按顺序表示字典里一个词的特征,用逗号分隔。
-
-### 修改词向量
+ [-0.38961065 -0.02392169 -0.00093231 0.36301503 0.13538605 0.16076435
+ -0.0678709 0.1090285 0.42014077 -0.24119169 -0.31847557 0.20410083
+ 0.04910378 0.19021918 -0.0122014 -0.04099389 -0.16924137 0.1911236
+ -0.10917275 0.13068172 -0.23079982 0.42699069 -0.27679482 -0.01472992
+ 0.2069038 0.09005053 -0.3282454 0.12717034 -0.24218646 0.25304323
+ 0.19072419 -0.24286366]
-我们可以对词向量进行修改,并转换成PaddlePaddle参数二进制格式,方法:
-```bash
-python format_convert.py --t2b -i INPUT -o OUTPUT
-```
-其中,INPUT是输入的输入的文本词向量模型名称,OUTPUT是输出的二进制词向量模型名称
+### Modifying the Word Vectors
+
+The retrieved embedding is a standard numpy matrix. We can modify this numpy matrix and then set it back.
-输入的文本格式如下(注意,不包含上面二进制转文本后第一行的格式说明):
-```text
--0.7444070,-0.1846171,-1.5771370,0.7070392,2.1963732,-0.0091410, ......
--0.0721337,-0.2429973,-0.0606297,0.1882059,-0.2072131,-0.7661019, ......
-......
+```python
+def modify_embedding(emb):
+ # Add your modification here.
+ pass
+
+modify_embedding(embeddings)
+parameters.set("_proj", embeddings)
```
-
-
### Computing the Cosine Distance Between Words
The distance between two vectors can be measured by their cosine value, which lies in the interval $[-1,1]$; the larger the cosine value between two vectors, the closer they are. Here we implement the distance measurement between different words in `calculate_dis.py`.
Usage is as follows:
-```bash
-python calculate_dis.py VOCABULARY EMBEDDINGLAYER`
-```
-其中,`VOCABULARY`是字典,`EMBEDDINGLAYER`是词向量模型,示例如下:
+```python
+from scipy import spatial
+
+emb_1 = embeddings[word_dict['world']]
+emb_2 = embeddings[word_dict['would']]
-```bash
-python calculate_dis.py data/vocabulary.txt model/pass-00029/_proj.txt
+print spatial.distance.cosine(emb_1, emb_2)
```
-
-
+
+ 0.99375076448
+
+
## Summary
In this chapter, we introduced word vectors, the relationship between language models and word vectors, and how to obtain word vectors by training a neural network model. In information retrieval, the relevance between a query and a document keyword can be judged from the cosine of the angle between their vectors. In syntactic and semantic analysis, trained word vectors can be used to initialize models for better results. In document classification, word vectors make it possible to cluster synonyms in documents. We hope readers can apply word vectors to research in related fields after this chapter.
diff --git a/word2vec/format_convert.py b/word2vec/format_convert.py
index f12ad81c0aa0d532d6f337d41479228f5b04ebc9..ddbff62a942d0249bbca49aabd07ef3276b2d15c 100755
--- a/word2vec/format_convert.py
+++ b/word2vec/format_convert.py
@@ -30,25 +30,25 @@ import struct
def binary2text(input, output, paraDim):
"""
- Convert a binary parameter file of embedding model to be a text file.
+ Convert a binary parameter file of the embedding model into a text file.
input: the name of input binary parameter file, the format is:
1) the first 16 bytes is filehead:
version(4 bytes): version of paddle, default = 0
floatSize(4 bytes): sizeof(float) = 4
paraCount(8 bytes): total number of parameter
- 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
+ 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
output: the name of output text parameter file, for example:
0,4,32156096
-0.7845433,1.1937413,-0.1704215,...
0.0000909,0.0009465,-0.0008813,...
...
the format is:
- 1) the first line is filehead:
+ 1) the first line is filehead:
version=0, floatSize=4, paraCount=32156096
2) other lines print the parameters
a) each line prints paraDim parameters separated by ','
b) there are paraCount/paraDim lines (embedding words)
- paraDim: dimension of parameters
+ paraDim: dimension of parameters
"""
fi = open(input, "rb")
fo = open(output, "w")
@@ -78,7 +78,7 @@ def binary2text(input, output, paraDim):
def get_para_count(input):
"""
- Compute the total number of embedding parameters in input text file.
+ Compute the total number of embedding parameters in the input text file.
input: the name of input text file
"""
numRows = 1
@@ -96,14 +96,14 @@ def text2binary(input, output, paddle_head=True):
Convert a text parameter file of the embedding model into a binary file.
input: the name of input text parameter file, for example:
-0.7845433,1.1937413,-0.1704215,...
- 0.0000909,0.0009465,-0.0008813,...
+ 0.0000909,0.0009465,-0.0008813,...
...
the format is:
1) it doesn't have filehead
- 2) each line stores the same dimension of parameters,
+ 2) each line stores the same dimension of parameters,
the separator is commas ','
output: the name of output binary parameter file, the format is:
- 1) the first 16 bytes is filehead:
+ 1) the first 16 bytes is filehead:
version(4 bytes), floatSize(4 bytes), paraCount(8 bytes)
2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
"""
@@ -127,7 +127,7 @@ def text2binary(input, output, paddle_head=True):
def main():
"""
- Main entry for running format_convert.py
+ Main entry for running format_convert.py
"""
usage = "usage: \n" \
"python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
diff --git a/word2vec/index.en.html b/word2vec/index.en.html
index f20c2b3122645cfc42d0248c11a74df18e1a7d1e..7b041212d24853f49b8ef6cb1e4182cc5ffc8236 100644
--- a/word2vec/index.en.html
+++ b/word2vec/index.en.html
@@ -1,3 +1,4 @@
+
diff --git a/word2vec/index.html b/word2vec/index.html
index dbc9c53bef2f5614dcad34a4f22596da5f47b0e9..e7b232121b0b1d2c9f47720602c05e171597497a 100644
--- a/word2vec/index.html
+++ b/word2vec/index.html
@@ -1,3 +1,4 @@
+