From f2eaaad2657673f2915a6fd024f5e9b5624c42a4 Mon Sep 17 00:00:00 2001
From: Aston Zhang
Date: Sun, 30 Sep 2018 14:34:02 +0000
Subject: [PATCH] revise gluonbook utils

---
 chapter_appendix/gluonbook.md                 |  15 +-
 chapter_natural-language-processing/index.md  |   2 +-
 .../sentiment-analysis-rnn.md                 | 219 ++++++++++++++++++
 .../sentiment-analysis.md                     |   2 +-
 chapter_recurrent-neural-networks/deep-rnn.md |   2 +-
 .../rnn-scratch.md                            |   4 +-
 gluonbook/utils.py                            |  23 +-
 7 files changed, 249 insertions(+), 18 deletions(-)
 create mode 100644 chapter_natural-language-processing/sentiment-analysis-rnn.md

diff --git a/chapter_appendix/gluonbook.md b/chapter_appendix/gluonbook.md
index 65a8c1c..d14b0f1 100644
--- a/chapter_appendix/gluonbook.md
+++ b/chapter_appendix/gluonbook.md
@@ -6,16 +6,17 @@
 | `bbox_to_rect`|[Object Detection and Bounding Boxes](../chapter_computer-vision/bounding-box.md)|
 | `Benchmark`|[Asynchronous Computation](../chapter_computational-performance/async-computation.md)|
 | `corr2d`|[Two-Dimensional Convolutional Layers](../chapter_convolutional-neural-networks/conv-layer.md)|
-| `count_tokens`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis.md)|
+| `count_tokens`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
 | `data_iter`|[Linear Regression Implementation from Scratch](../chapter_deep-learning-basics/linear-regression-scratch.md)|
 | `data_iter_consecutive`|[Language Model Data Set (Jay Chou Album Lyrics)](../chapter_recurrent-neural-networks/lang-model-dataset.md)|
 | `data_iter_random`|[Language Model Data Set (Jay Chou Album Lyrics)](../chapter_recurrent-neural-networks/lang-model-dataset.md)|
-| `download_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis.md)|
+| `download_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
 | `download_voc_pascal`|[Semantic Segmentation and Data Sets](../chapter_computer-vision/semantic-segmentation-and-dataset.md)|
 | `evaluate_accuracy`|[Image Augmentation](../chapter_computer-vision/image-augmentation.md)|
 | `get_data_ch7`|[Mini-Batch Stochastic Gradient Descent](../chapter_optimization/minibatch-sgd.md)|
 | `get_fashion_mnist_labels`|[Image Classification Data Set (Fashion-MNIST)](../chapter_deep-learning-basics/fashion-mnist.md)|
-| `get_tokenized_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis.md)|
+| `get_tokenized_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
+| `get_vocab_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
 | `grad_clipping`|[Implementation of Recurrent Neural Networks from Scratch](../chapter_recurrent-neural-networks/rnn-scratch.md)|
 | `linreg`|[Linear Regression Implementation from Scratch](../chapter_deep-learning-basics/linear-regression-scratch.md)|
 | `load_data_fashion_mnist`|[Deep Convolutional Neural Networks (AlexNet)](../chapter_convolutional-neural-networks/alexnet.md)|
@@ -24,9 +25,9 @@
 | `plt`|[Linear Regression Implementation from Scratch](../chapter_deep-learning-basics/linear-regression-scratch.md)|
 | `predict_rnn`|[Implementation of Recurrent Neural Networks from Scratch](../chapter_recurrent-neural-networks/rnn-scratch.md)|
 | `predict_rnn_gluon`|[Gluon Implementation of Recurrent Neural Networks](../chapter_recurrent-neural-networks/rnn-gluon.md)|
-| `predict_sentiment`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis.md)|
-| `preprocess_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis.md)|
-| `read_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis.md)|
+| `predict_sentiment`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
+| `preprocess_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
+| `read_imdb`|[Text Sentiment Classification: Using Recurrent Neural Networks](../chapter_natural-language-processing/sentiment-analysis-rnn.md)|
 | `read_voc_images`|[Semantic Segmentation and Data Sets](../chapter_computer-vision/semantic-segmentation-and-dataset.md)|
 | `Residual`|[Residual Networks (ResNet)](../chapter_convolutional-neural-networks/resnet.md)|
 | `resnet18`|[Gluon Implementation of Multi-GPU Computation](../chapter_computational-performance/multiple-gpus-gluon.md)|
@@ -36,8 +37,8 @@
 | `sgd`|[Linear Regression Implementation from Scratch](../chapter_deep-learning-basics/linear-regression-scratch.md)|
 | `show_bboxes`|[Anchor Boxes](../chapter_computer-vision/anchor.md)|
 | `show_fashion_mnist`|[Image Classification Data Set (Fashion-MNIST)](../chapter_deep-learning-basics/fashion-mnist.md)|
-| `show_trace_2d`|[Gradient Descent and Stochastic Gradient Descent](../chapter_optimization/gd-sgd.md)|
 | `show_images`|[Image Augmentation](../chapter_computer-vision/image-augmentation.md)|
+| `show_trace_2d`|[Gradient Descent and Stochastic Gradient Descent](../chapter_optimization/gd-sgd.md)|
 | `squared_loss`|[Linear Regression Implementation from Scratch](../chapter_deep-learning-basics/linear-regression-scratch.md)|
 | `to_onehot`|[Implementation of Recurrent Neural Networks from Scratch](../chapter_recurrent-neural-networks/rnn-scratch.md)|
 | `train`|[Image Augmentation](../chapter_computer-vision/image-augmentation.md)|
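The appendix table above maps each saved helper to the section of the book that defines it; later sections import these helpers from the `gluonbook` package instead of re-defining them. A minimal usage sketch, outside the patch itself and assuming the `gluonbook` package from this repository is on the Python path:

```python
# Illustrative reuse of helpers listed in the appendix table; assumes the
# gluonbook package from this repository is importable.
import gluonbook as gb

gb.download_imdb()                     # fetch and extract the IMDb review data
train_data = gb.read_imdb('train')     # a list of [review, label] pairs
vocab = gb.get_vocab_imdb(train_data)  # vocabulary built with min_freq=5
print(len(train_data), len(vocab))
```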
diff --git a/chapter_natural-language-processing/index.md b/chapter_natural-language-processing/index.md
index a785119..1a1dd12 100644
--- a/chapter_natural-language-processing/index.md
+++ b/chapter_natural-language-processing/index.md
@@ -15,7 +15,7 @@
    fasttext
    glove
    similarity-analogy
-   sentiment-analysis
+   sentiment-analysis-rnn
    sentiment-analysis-cnn
    seq2seq
    beam-search
diff --git a/chapter_natural-language-processing/sentiment-analysis-rnn.md b/chapter_natural-language-processing/sentiment-analysis-rnn.md
new file mode 100644
index 0000000..c8627b3
--- /dev/null
+++ b/chapter_natural-language-processing/sentiment-analysis-rnn.md
@@ -0,0 +1,219 @@
+# Text Sentiment Classification: Using Recurrent Neural Networks
+
+Text classification is a common task in natural language processing: it maps a text sequence of arbitrary length to a category. This section focuses on one of its sub-problems, text sentiment classification (sentiment analysis), which identifies the emotion expressed in a piece of text. The problem has wide applications: analyzing user reviews of a product can quantify user satisfaction, and analyzing user sentiment about market conditions can help predict future trends.
+
+In this section we apply pre-trained word vectors and a bidirectional recurrent neural network with multiple hidden layers to decide whether a text sequence of arbitrary length expresses a positive or a negative emotion. Before the experiment starts, import the required packages and modules.
+
+```{.python .input n=2}
+import sys
+sys.path.insert(0, '..')
+
+import collections
+import gluonbook as gb
+from mxnet import gluon, init, nd
+from mxnet.contrib import text
+from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
+import os
+import random
+import tarfile
+```
+
+## The Text Sentiment Classification Data Set
+
+We use Stanford's Large Movie Review Dataset as the data set for text sentiment classification [1]. It is split into a training set and a test set, each containing 25,000 movie reviews downloaded from IMDb. In each set, the numbers of reviews labeled "positive" (1) and "negative" (0) are equal.
+
+### Reading the Data
+
+We first download the data set to the "../data" path and then extract it to "../data/aclImdb".
+
+```{.python .input n=3}
+# This function is saved in the gluonbook package for later use.
+def download_imdb(data_dir='../data'):
+    url = ('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
+    sha1 = '01ada507287d82875905620988597833ad4e0903'
+    fname = gutils.download(url, data_dir, sha1_hash=sha1)
+    with tarfile.open(fname, 'r') as f:
+        f.extractall(data_dir)
+
+download_imdb()
+```
+
+Next, read the training and test sets. Each sample is a review together with its label: 1 for positive, 0 for negative.
+
+```{.python .input n=13}
+def read_imdb(folder='train'):  # This function is saved in the gluonbook package for later use.
+    data = []
+    for label in ['pos', 'neg']:
+        folder_name = os.path.join('../data/aclImdb/', folder, label)
+        for file in os.listdir(folder_name):
+            with open(os.path.join(folder_name, file), 'r') as f:
+                review = f.read().replace('\n', '').lower()
+                data.append([review, 1 if label == 'pos' else 0])
+    random.shuffle(data)
+    return data
+
+train_data, test_data = read_imdb('train'), read_imdb('test')
+```
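As a quick sanity check (an illustrative snippet, not included in the patch), one can look at a sample and confirm that the two labels are balanced; it assumes the `train_data` list built above:

```python
# Illustrative check on the data returned by read_imdb; assumes train_data
# from the cell above.
from collections import Counter

review, label = train_data[0]
print(label, review[:60])                          # a label and the start of its review
print(Counter(label for _, label in train_data))   # expect 12500 each of 1 and 0
```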
+
+### Data Preprocessing
+
+Next we tokenize each review to obtain reviews as lists of words. Here we use the simplest method: splitting on whitespace.
+
+```{.python .input n=14}
+def get_tokenized_imdb(data):  # This function is saved in the gluonbook package for later use.
+    def tokenizer(text):
+        return [tok.lower() for tok in text.split(' ')]
+    return [tokenizer(review) for review, _ in data]
+```
+
+Now we can build a vocabulary from the tokenized training set. Words that appear fewer than five times are filtered out.
+
+```{.python .input n=28}
+def get_vocab_imdb(data):  # This function is saved in the gluonbook package for later use.
+    tokenized_data = get_tokenized_imdb(data)
+    counter = collections.Counter([tk for st in tokenized_data for tk in st])
+    return text.vocab.Vocabulary(counter, min_freq=5)
+
+vocab = get_vocab_imdb(train_data)
+'# words in vocab:', len(vocab)
+```
+
+Because the reviews have different lengths, they cannot be combined directly into mini-batches. The function below tokenizes each review, converts it into word indices through the vocabulary, and then truncates it or pads it with 0 (the index of the unknown token in the vocabulary) so that every review has a fixed length of 500.
+
+```{.python .input n=44}
+def preprocess_imdb(data, vocab):  # This function is saved in the gluonbook package for later use.
+    max_l = 500  # Truncate or pad every review so that its length is exactly 500.
+    pad = lambda x: x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))
+    tokenized_data = get_tokenized_imdb(data)
+    features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
+    labels = nd.array([score for _, score in data])
+    return features, labels
+```
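To see what `preprocess_imdb` produces, a quick shape check helps (illustrative only; it assumes `train_data` and `vocab` from above):

```python
# Illustrative shape check; assumes train_data and vocab defined above.
features, labels = preprocess_imdb(train_data, vocab)
print(features.shape)     # expected: (25000, 500) -- 500 word indices per review
print(labels.shape)       # expected: (25000,)
print(features[0, :10])   # first ten indices of the first padded/truncated review
```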
+
+### Data Iterators
+
+Now we create data iterators that return one mini-batch of data on each call.
+
+```{.python .input}
+batch_size = 64
+train_set = gdata.ArrayDataset(*preprocess_imdb(train_data, vocab))
+test_set = gdata.ArrayDataset(*preprocess_imdb(test_data, vocab))
+train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
+test_iter = gdata.DataLoader(test_set, batch_size)
+```
+
+Print the shape of the first mini-batch and the number of mini-batches in the training set.
+
+```{.python .input}
+for X, y in train_iter:
+    print('X', X.shape, 'y', y.shape)
+    break
+'#batches:', len(train_iter)
+```
+
+## A Model Using a Recurrent Neural Network
+
+In this model, each word is first mapped to a feature vector by the embedding layer. Then a bidirectional recurrent neural network encodes the feature sequence further to obtain sequence information. Finally, the encoded sequence information is transformed into the output by a fully connected layer. In the implementation below, the `Embedding` instance is the embedding layer, the `LSTM` instance is the hidden layer that encodes the sequence, and the `Dense` instance is the output layer that produces the classification result.
+
+```{.python .input n=46}
+class BiRNN(nn.Block):
+    def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs):
+        super(BiRNN, self).__init__(**kwargs)
+        self.embedding = nn.Embedding(len(vocab), embed_size)
+        self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers,
+                                bidirectional=True, input_size=embed_size)
+        self.decoder = nn.Dense(2)
+
+    def forward(self, inputs):
+        # The shape of inputs is (batch size, number of words). Because the
+        # LSTM expects the sequence as its first dimension, the input is
+        # transposed before the word features are extracted. The output shape
+        # is (number of words, batch size, word vector dimension).
+        embeddings = self.embedding(inputs.T)
+        # The shape of states is (number of words, batch size,
+        # 2 * number of hidden units).
+        states = self.encoder(embeddings)
+        # Concatenate the hidden states of the initial and the final time step
+        # as the input of the fully connected layer. Its shape is
+        # (batch size, 4 * number of hidden units).
+        encoding = nd.concat(states[0], states[-1])
+        outputs = self.decoder(encoding)
+        return outputs
+```
+
+Create a bidirectional recurrent neural network with two hidden layers.
+
+```{.python .input}
+embed_size, num_hiddens, num_layers, ctx = 100, 100, 2, gb.try_all_gpus()
+net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
+net.initialize(init.Xavier(), ctx=ctx)
+```
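To make the shape comments in `forward` concrete, here is an illustrative trace on a separate CPU instance of the model (a sketch the patch does not include; it assumes `BiRNN`, `vocab`, and the hyperparameters defined above):

```python
# Illustrative shape trace; assumes BiRNN, vocab, embed_size, num_hiddens and
# num_layers from above. A fresh CPU instance avoids GPU-context handling.
cpu_net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
cpu_net.initialize()

X = nd.zeros((2, 500))                   # a dummy batch: 2 reviews of 500 word indices
emb = cpu_net.embedding(X.T)             # (500, 2, 100): (words, batch size, embed_size)
states = cpu_net.encoder(emb)            # (500, 2, 200): 2 * num_hiddens for a BiLSTM
enc = nd.concat(states[0], states[-1])   # (2, 400): initial and final steps concatenated
print(emb.shape, states.shape, enc.shape, cpu_net.decoder(enc).shape)  # last: (2, 2)
```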
+
+### Loading Pre-trained Word Vectors
+
+Because the sentiment classification training set is not very large, we counter overfitting by directly using word vectors pre-trained on a larger corpus as the feature vector of every word. Here we load a 100-dimensional GloVe vector for each word in the vocabulary `vocab`.
+
+```{.python .input n=45}
+glove_embedding = text.embedding.create(
+    'glove', pretrained_file_name='glove.6B.100d.txt', vocabulary=vocab)
+```
+
+We then use these word vectors as the feature vectors of the words in the reviews. Note that the dimension of the pre-trained word vectors must match the output size of the embedding layer in the model. In addition, we no longer update these word vectors during training.
+
+```{.python .input n=47}
+net.embedding.weight.set_data(glove_embedding.idx_to_vec)
+net.embedding.collect_params().setattr('grad_req', 'null')
+```
+
+### Training and Evaluating the Model
+
+Now we can start training.
+
+```{.python .input n=48}
+lr, num_epochs = 0.8, 5
+trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
+loss = gloss.SoftmaxCrossEntropyLoss()
+gb.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)
+```
+
+Finally, define the prediction function.
+
+```{.python .input n=49}
+# This function is saved in the gluonbook package for later use.
+def predict_sentiment(net, vocab, sentence):
+    sentence = nd.array(vocab.to_indices(sentence), ctx=gb.try_gpu())
+    label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
+    return 'positive' if label.asscalar() == 1 else 'negative'
+```
+
+Then use the trained model to classify the sentiment of two simple sentences.
+
+```{.python .input n=50}
+predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])
+```
+
+```{.python .input}
+predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])
+```
+
+## Summary
+
+* We can apply pre-trained word vectors and recurrent neural networks to classify the sentiment of a text.
+
+
+## Exercises
+
+* Increase the number of epochs. What accuracy does your model reach on the training and test sets? Can you improve the classification accuracy further by tuning hyperparameters?
+
+* Does using larger pre-trained word vectors, such as the 300-dimensional GloVe vectors, improve the classification accuracy?
+
+* Does using the spaCy tokenizer improve the classification accuracy? You need to install spaCy (`pip install spacy`) and its English package (`python -m spacy download en`). In the code, first import spaCy (`import spacy`), then load the English package (`spacy_en = spacy.load('en')`), and finally define a function `def tokenizer(text): return [tok.text for tok in spacy_en.tokenizer(text)]` to replace the original whitespace-based `tokenizer` function (see the sketch after this list). Note that GloVe stores a noun phrase by joining its words with "-": for example, the phrase "new york" is represented as "new-york" in GloVe, whereas after spaCy tokenization "new york" may remain as two separate tokens.
+
+* With the three methods above, can you raise the model's accuracy on the test set above 0.85?
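For the spaCy exercise, a minimal sketch of the replacement tokenizer (not part of the chapter; it assumes spaCy and its English model are installed as described in the exercise):

```python
# Illustrative tokenizer for the spaCy exercise; assumes `pip install spacy`
# and `python -m spacy download en` have been run.
import spacy

spacy_en = spacy.load('en')

def tokenizer(text):
    # Drop-in replacement for the whitespace tokenizer inside get_tokenized_imdb.
    return [tok.text for tok in spacy_en.tokenizer(text)]

print(tokenizer('this movie is so great'))
```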
+
+
+## Scan the QR Code to Access the [Forum](https://discuss.gluon.ai/t/topic/6155)
+
+
+![](../img/qr_sentiment-analysis.svg)
+
+
+## References
+
+[1] Maas, A. L., Daly, R. E., Pham, P. T., Huang, D., Ng, A. Y., & Potts, C. (2011, June). Learning word vectors for sentiment analysis. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies - Volume 1 (pp. 142-150). Association for Computational Linguistics.
diff --git a/chapter_natural-language-processing/sentiment-analysis.md b/chapter_natural-language-processing/sentiment-analysis.md
index cc7f8c9..c8627b3 100644
--- a/chapter_natural-language-processing/sentiment-analysis.md
+++ b/chapter_natural-language-processing/sentiment-analysis.md
@@ -69,7 +69,7 @@ def get_tokenized_imdb(data): # 本函数已保存在 gluonbook 包中方便以
 Now we can build a vocabulary from the tokenized training set. Words that appear fewer than five times are filtered out.
 
 ```{.python .input n=28}
-def get_vocab_imdb(data):
+def get_vocab_imdb(data):  # This function is saved in the gluonbook package for later use.
     tokenized_data = get_tokenized_imdb(data)
     counter = collections.Counter([tk for st in tokenized_data for tk in st])
     return text.vocab.Vocabulary(counter, min_freq=5)
diff --git a/chapter_recurrent-neural-networks/deep-rnn.md b/chapter_recurrent-neural-networks/deep-rnn.md
index 55879aa..7a278a0 100644
--- a/chapter_recurrent-neural-networks/deep-rnn.md
+++ b/chapter_recurrent-neural-networks/deep-rnn.md
@@ -34,7 +34,7 @@ $$\boldsymbol{O}_t = \boldsymbol{H}_t^{(L)} \boldsymbol{W}_{hq} + \boldsymbol{b}
 
 ## Exercises
 
-* Change the model in the ["Language Models Based on Recurrent Neural Networks"](rnn-lang-model.md) section into a recurrent neural network with two hidden layers. Observe and analyze the experimental results.
+* Change the model in the ["Implementation of Recurrent Neural Networks from Scratch"](rnn-scratch.md) section into a recurrent neural network with two hidden layers. Observe and analyze the experimental results.
 
 ## Scan the QR Code to Access the [Forum](https://discuss.gluon.ai/t/topic/6730)
 
diff --git a/chapter_recurrent-neural-networks/rnn-scratch.md b/chapter_recurrent-neural-networks/rnn-scratch.md
index b4280cf..1f9f064 100644
--- a/chapter_recurrent-neural-networks/rnn-scratch.md
+++ b/chapter_recurrent-neural-networks/rnn-scratch.md
@@ -106,7 +106,7 @@ def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                 num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx):
     state = init_rnn_state(1, num_hiddens, ctx)
     output = [char_to_idx[prefix[0]]]
-    for t in range(num_chars + len(prefix)):
+    for t in range(num_chars + len(prefix) - 1):
         # Use the output of the previous time step as the input of the current time step.
         X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
         # Compute the output and update the hidden state.
@@ -119,7 +119,7 @@ def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
     return ''.join([idx_to_char[i] for i in output])
 ```
 
-Test this function. Because the model parameters are random values, the prediction results are also random.
+We first test the `predict_rnn` function. Using "分开" as the prefix, we generate 10 additional characters of lyrics. Because the model parameters are random values, the prediction results are also random.
 
 ```{.python .input n=9}
 predict_rnn('分开', 10, rnn, params, init_rnn_state, num_hiddens, vocab_size,
diff --git a/gluonbook/utils.py b/gluonbook/utils.py
index 9b19e60..ce525bf 100644
--- a/gluonbook/utils.py
+++ b/gluonbook/utils.py
@@ -1,18 +1,18 @@
 import collections
-import random
-import zipfile
 import math
 import os
+import random
 import sys
 import tarfile
 import time
+import zipfile
 
 from IPython import display
 from matplotlib import pyplot as plt
 import mxnet as mx
-from mxnet import autograd, gluon, image, nd, init
+from mxnet import autograd, gluon, image, init, nd
 from mxnet.contrib import text
-from mxnet.gluon import nn, data as gdata, loss as gloss, utils as gutils
+from mxnet.gluon import data as gdata, loss as gloss, nn, utils as gutils
 import numpy as np
 
 
@@ -165,6 +165,7 @@ def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
         acc.wait_to_read()
     return acc.asscalar() / n
 
+
 def _get_batch(batch, ctx):
     """Return features and labels on ctx."""
     features, labels = batch
@@ -174,6 +175,7 @@ def _get_batch(batch, ctx):
             gutils.split_and_load(labels, ctx),
             features.shape[0])
 
+
 def get_data_ch7():
     """Get the data set used in Chapter 7."""
     data = np.genfromtxt('../data/airfoil_self_noise.dat', delimiter='\t')
@@ -194,12 +196,14 @@ def get_tokenized_imdb(data):
         return [tok.lower() for tok in text.split(' ')]
     return [tokenizer(review) for review, _ in data]
 
+
 def get_vocab_imdb(data):
-    """Get the vocab for the IMBD data set for sentiment analysis."""
+    """Get the vocab for the IMDB data set for sentiment analysis."""
     tokenized_data = get_tokenized_imdb(data)
     counter = collections.Counter([tk for st in tokenized_data for tk in st])
     return text.vocab.Vocabulary(counter, min_freq=5)
 
+
 def grad_clipping(params, theta, ctx):
     """Clip the gradient."""
     if theta is not None:
@@ -282,12 +286,13 @@ def _make_list(obj, default_values=None):
         obj = [obj]
     return obj
 
+
 def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state, num_hiddens,
                 vocab_size, ctx, idx_to_char, char_to_idx):
     """Predict next chars with a RNN model"""
     state = init_rnn_state(1, num_hiddens, ctx)
     output = [char_to_idx[prefix[0]]]
-    for t in range(num_chars + len(prefix)):
+    for t in range(num_chars + len(prefix) - 1):
         X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
         (Y, state) = rnn(X, state, params)
         if t < len(prefix) - 1:
@@ -318,6 +323,7 @@ def predict_sentiment(net, vocab, sentence):
     label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
     return 'positive' if label.asscalar() == 1 else 'negative'
 
+
 def preprocess_imdb(data, vocab):
     """Preprocess the IMDB data set for sentiment analysis."""
     max_l = 500
@@ -327,6 +333,7 @@ def preprocess_imdb(data, vocab):
     labels = nd.array([score for _, score in data])
     return features, labels
 
+
 def read_imdb(folder='train'):
     """Read the IMDB data set for sentiment analysis."""
     data = []
@@ -339,6 +346,7 @@ def read_imdb(folder='train'):
     random.shuffle(data)
     return data
 
+
 def read_voc_images(root='../data/VOCdevkit/VOC2012', train=True):
     """Read VOC images."""
     txt_fname = '%s/ImageSets/Segmentation/%s' % (
@@ -528,6 +536,7 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
               % (epoch, train_l_sum / n, train_acc_sum / m, test_acc,
                  time.time() - start))
 
+
 def train_2d(trainer):
     """Train a 2d object function with a customized trainer"""
     x1, x2 = -5, -2
@@ -539,6 +548,7 @@ def train_2d(trainer):
     print('epoch %d, x1 %f, x2 %f' % (i+1, x1, x2))
     return res
 
+
 def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                           vocab_size, ctx, corpus_indices, idx_to_char,
                           char_to_idx, is_random_iter, num_epochs, num_steps,
@@ -802,3 +812,4 @@ class VOCSegDataset(gdata.Dataset):
 
     def __len__(self):
         return len(self.data)
+
-- 
GitLab
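Both rnn-scratch.md and gluonbook/utils.py change the prediction loop bound from `range(num_chars + len(prefix))` to `range(num_chars + len(prefix) - 1)`. Since `output` is seeded with the first prefix character before the loop, only `len(prefix) - 1` iterations are needed to consume the rest of the prefix, plus `num_chars` iterations to generate new characters. A small counting sketch, independent of MXNet, that illustrates the fix:

```python
# Counting sketch for the loop-bound fix: with the old bound one extra character
# is generated; with the new bound the output has exactly
# len(prefix) + num_chars characters.
prefix, num_chars = '分开', 10

def output_length(bound):
    output = [prefix[0]]                   # the first prefix character is pre-seeded
    for t in range(bound):
        if t < len(prefix) - 1:
            output.append(prefix[t + 1])   # consume the rest of the prefix
        else:
            output.append('*')             # stand-in for a predicted character
    return len(output)

print(output_length(num_chars + len(prefix)))      # old bound: 13 characters
print(output_length(num_chars + len(prefix) - 1))  # new bound: 12 = len(prefix) + num_chars
```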