......@@ -94,7 +94,9 @@ Dive into Deep Learning with PyTorch.
[10.3 word2vec的实现](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter10_natural-language-processing/10.3_word2vec-pytorch.md)
[10.4 子词嵌入(fastText)](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter10_natural-language-processing/10.4_fasttext.md)
[10.5 全局向量的词嵌入(GloVe)](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter10_natural-language-processing/10.5_glove.md)
[10.6 求近义词和类比词](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter10_natural-language-processing/10.6_similarity-analogy.md)
[10.7 文本情感分类:使用循环神经网络](https://github.com/ShusenTang/Dive-into-DL-PyTorch/blob/master/docs/chapter10_natural-language-processing/10.7_sentiment-analysis-rnn.md)
......@@ -15,7 +15,8 @@ from torch import nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchtext.vocab as vocab
import torchtext
import torchtext.vocab as Vocab
import numpy as np
......@@ -720,4 +721,64 @@ def bbox_to_rect(bbox, color):
# ((左上x, 左上y), 宽, 高)
return d2l.plt.Rectangle(
xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
fill=False, edgecolor=color, linewidth=2)
fill=False, edgecolor=color, linewidth=2)
# ############################# 10.7 ##########################
def read_imdb(folder='train', data_root="/S1/CSCL/tangss/Datasets/aclImdb"):
    """Load the IMDb reviews stored under ``data_root/folder``.

    Every file found in the ``pos`` and ``neg`` sub-directories is read as
    bytes, decoded as UTF-8, stripped of newline characters and lower-cased.

    Args:
        folder: Sub-directory of ``data_root`` to read (default ``'train'``).
        data_root: Directory that contains the extracted dataset.

    Returns:
        A list of ``[review, label]`` pairs, where ``label`` is 1 for files
        under ``pos`` and 0 for files under ``neg``.
    """
    samples = []
    for sentiment in ['pos', 'neg']:
        target = 1 if sentiment == 'pos' else 0
        review_dir = os.path.join(data_root, folder, sentiment)
        # The directory listing is wrapped in tqdm (progress display).
        for name in tqdm(os.listdir(review_dir)):
            with open(os.path.join(review_dir, name), 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8').replace('\n', '').lower()
            samples.append([text, target])
    return samples
def get_tokenized_imdb(data):
    """Tokenize every review in ``data``.

    Args:
        data: list of ``[string, label]`` pairs; the label is ignored.

    Returns:
        A list with one token list per review. Tokens are lower-cased and
        produced by splitting on a single space, so consecutive spaces yield
        empty-string tokens.
    """
    def tokenizer(text):
        # Split on a literal space only (not on arbitrary whitespace).
        return [tok.lower() for tok in text.split(' ')]

    return [tokenizer(review) for review, _ in data]
def get_vocab_imdb(data):
    """Build a vocabulary from the tokens of all reviews in ``data``.

    Args:
        data: list of ``[string, label]`` pairs, as accepted by
            ``get_tokenized_imdb``.

    Returns:
        The ``torchtext.vocab.Vocab`` constructed from the token counts with
        ``min_freq=5``.
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return torchtext.vocab.Vocab(token_counts, min_freq=5)
def preprocess_imdb(data, vocab, max_l=500):
    """Turn reviews into fixed-length index sequences plus a label tensor.

    Args:
        data: list of ``[string, label]`` pairs.
        vocab: object exposing ``stoi``, a mapping from token to integer
            index, which is looked up for every token.
        max_l: Length every review is truncated or padded to (default 500,
            the value that used to be hard-coded).

    Returns:
        ``(features, labels)``: ``features`` is a tensor with one row of
        ``max_l`` indices per review, ``labels`` is a tensor of the labels
        taken from ``data``.
    """
    def pad(x):
        # Truncate long reviews; right-pad short ones with index 0.
        # NOTE(review): with torchtext's default specials index 0 is
        # presumably '<unk>' rather than '<pad>' -- confirm this is intended.
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor(
        [pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels
def load_pretrained_embedding(words, pretrained_vocab):
    """Extract the pretrained vectors that correspond to ``words``.

    Args:
        words: Sequence of tokens; row ``i`` of the result belongs to
            ``words[i]``.
        pretrained_vocab: Object exposing ``stoi`` (token -> row index) and
            ``vectors`` (indexable by that row index).

    Returns:
        A tensor of shape ``(len(words), vector_size)``. Rows of words whose
        lookup in ``stoi`` raises ``KeyError`` are left as zeros.
    """
    # Start from zeros so out-of-vocabulary rows stay zero.
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0  # number of out-of-vocabulary words
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            oov_count += 1
            continue
        embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as ``'positive'`` or ``'negative'``.

    Args:
        net: ``torch.nn.Module`` called with a ``(1, len(sentence))`` index
            tensor; the argmax over dim 1 of its output is the class.
        vocab: object exposing ``stoi``, a mapping from token to index.
        sentence: list of word tokens.

    Returns:
        ``'positive'`` if the predicted class is 1, otherwise ``'negative'``.
    """
    # Place the input on the model's own device instead of depending on a
    # module-level ``device`` global that this function does not receive.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    indices = torch.tensor([vocab.stoi[word] for word in sentence],
                           dtype=torch.long, device=device)
    with torch.no_grad():  # inference only, no gradients needed
        label = torch.argmax(net(indices.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'
# 文本情感分类:使用循环神经网络
# 10.7 文本情感分类:使用循环神经网络
......@@ -6,45 +6,52 @@
```{.python .input n=2}
``` python
import collections
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import sys
import d2lzh_pytorch as d2l
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = "/S1/CSCL/tangss/Datasets"
## 文本情感分类数据
## 10.7.1 文本情感分类数据
我们使用斯坦福的IMDb数据集(Stanford's Large Movie Review Dataset)作为文本情感分类的数据集 [1]。这个数据集分为训练和测试用的两个数据集,分别包含25,000条从IMDb下载的关于电影的评论。在每个数据集中,标签为“正面”和“负面”的评论数量相等。
### 读取数据
### 读取数据
```{.python .input n=3}
# 本函数已保存在d2lzh包中方便以后使用
def download_imdb(data_dir='../data'):
url = ('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
sha1 = '01ada507287d82875905620988597833ad4e0903'
fname = gutils.download(url, data_dir, sha1_hash=sha1)
``` python
fname = os.path.join(DATA_ROOT, "aclImdb_v1.tar.gz")
if not os.path.exists(os.path.join(DATA_ROOT, "aclImdb")):
with tarfile.open(fname, 'r') as f:
```{.python .input n=13}
def read_imdb(folder='train'): # 本函数已保存在d2lzh包中方便以后使用
``` python
from tqdm import tqdm
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def read_imdb(folder='train', data_root="/S1/CSCL/tangss/Datasets/aclImdb"):
data = []
for label in ['pos', 'neg']:
folder_name = os.path.join('../data/aclImdb/', folder, label)
for file in os.listdir(folder_name):
folder_name = os.path.join(data_root, folder, label)
for file in tqdm(os.listdir(folder_name)):
with open(os.path.join(folder_name, file), 'rb') as f:
review = f.read().decode('utf-8').replace('\n', '').lower()
data.append([review, 1 if label == 'pos' else 0])
......@@ -54,12 +61,16 @@ def read_imdb(folder='train'): # 本函数已保存在d2lzh包中方便以后
train_data, test_data = read_imdb('train'), read_imdb('test')
### 预处理数据
### 预处理数据
```{.python .input n=14}
def get_tokenized_imdb(data): # 本函数已保存在d2lzh包中方便以后使用
``` python
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def get_tokenized_imdb(data):
    """Tokenize every review in ``data``.

    Args:
        data: list of ``[string, label]`` pairs; the label is ignored.

    Returns:
        A list with one token list per review. Tokens are lower-cased and
        produced by splitting on a single space, so consecutive spaces yield
        empty-string tokens.
    """
    def tokenizer(text):
        # Split on a literal space only (not on arbitrary whitespace).
        return [tok.lower() for tok in text.split(' ')]

    return [tokenizer(review) for review, _ in data]
......@@ -67,132 +78,173 @@ def get_tokenized_imdb(data): # 本函数已保存在d2lzh包中方便以后使
```{.python .input n=28}
def get_vocab_imdb(data): # 本函数已保存在d2lzh包中方便以后使用
``` python
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def get_vocab_imdb(data):
tokenized_data = get_tokenized_imdb(data)
counter = collections.Counter([tk for st in tokenized_data for tk in st])
return text.vocab.Vocabulary(counter, min_freq=5)
return Vocab.Vocab(counter, min_freq=5)
vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)
('# words in vocab:', 46151)
```{.python .input n=44}
def preprocess_imdb(data, vocab): # 本函数已保存在d2lzh包中方便以后使用
``` python
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def preprocess_imdb(data, vocab):
max_l = 500 # 将每条评论通过截断或者补0,使得长度变成500
def pad(x):
return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))
tokenized_data = get_tokenized_imdb(data)
features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
labels = nd.array([score for _, score in data])
features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
labels = torch.tensor([score for _, score in data])
return features, labels
### 创建数据迭代器
### 创建数据迭代器
```{.python .input}
``` python
batch_size = 64
train_set = gdata.ArrayDataset(*preprocess_imdb(train_data, vocab))
test_set = gdata.ArrayDataset(*preprocess_imdb(test_data, vocab))
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
```{.python .input}
``` python
for X, y in train_iter:
print('X', X.shape, 'y', y.shape)
'#batches:', len(train_iter)
X torch.Size([64, 500]) y torch.Size([64])
('#batches:', 391)
## 使用循环神经网络的模型
## 10.7.2 使用循环神经网络的模型
```{.python .input n=46}
class BiRNN(nn.Block):
def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs):
super(BiRNN, self).__init__(**kwargs)
``` python
class BiRNN(nn.Module):
def __init__(self, vocab, embed_size, num_hiddens, num_layers):
super(BiRNN, self).__init__()
self.embedding = nn.Embedding(len(vocab), embed_size)
# bidirectional设为True即得到双向循环神经网络
self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers,
bidirectional=True, input_size=embed_size)
self.decoder = nn.Dense(2)
self.encoder = nn.LSTM(input_size=embed_size,
# 初始时间步和最终时间步的隐藏状态作为全连接层输入
self.decoder = nn.Linear(4*num_hiddens, 2)
def forward(self, inputs):
# inputs的形状是(批量大小, 词数),因为LSTM需要将序列作为第一维,所以将输入转置后
# inputs的形状是(批量大小, 词数),因为LSTM需要将序列长度(seq_len)作为第一维,所以将输入转置后
# 再提取词特征,输出形状为(词数, 批量大小, 词向量维度)
embeddings = self.embedding(inputs.T)
# states形状是(词数, 批量大小, 2 * 隐藏单元个数)
states = self.encoder(embeddings)
embeddings = self.embedding(inputs.permute(1, 0))
# nn.LSTM只传入输入embeddings(未传入初始状态,默认为0),返回outputs和(h, c),这里只使用最后一层在各时间步的隐藏状态outputs。
# outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
outputs, _ = self.encoder(embeddings) # output, (h, c)
# 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
# (批量大小, 4 * 隐藏单元个数)。
encoding = nd.concat(states[0], states[-1])
outputs = self.decoder(encoding)
return outputs
encoding = torch.cat((outputs[0], outputs[-1]), -1)
outs = self.decoder(encoding)
return outs
```{.python .input}
embed_size, num_hiddens, num_layers, ctx = 100, 100, 2, d2l.try_all_gpus()
``` python
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
net.initialize(init.Xavier(), ctx=ctx)
### 加载预训练的词向量
### 加载预训练的词向量
```{.python .input n=45}
glove_embedding = text.embedding.create(
'glove', pretrained_file_name='glove.6B.100d.txt', vocabulary=vocab)
``` python
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))
```{.python .input n=47}
net.embedding.collect_params().setattr('grad_req', 'null')
``` python
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def load_pretrained_embedding(words, pretrained_vocab):
    """Extract the pretrained vectors that correspond to ``words``.

    Args:
        words: Sequence of tokens; row ``i`` of the result belongs to
            ``words[i]``.
        pretrained_vocab: Object exposing ``stoi`` (token -> row index) and
            ``vectors`` (indexable by that row index).

    Returns:
        A tensor of shape ``(len(words), vector_size)``. Rows of words whose
        lookup in ``stoi`` raises ``KeyError`` are left as zeros.
    """
    # Start from zeros so out-of-vocabulary rows stay zero.
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0  # number of out-of-vocabulary words
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            oov_count += 1
            continue
        embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它
### 训练并评价模型
### 训练并评价模型
```{.python .input n=48}
``` python
lr, num_epochs = 0.01, 5
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)
# 要过滤掉不计算梯度的embedding参数
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
training on cuda
epoch 1, loss 0.5759, train acc 0.666, test acc 0.832, time 250.8 sec
epoch 2, loss 0.1785, train acc 0.842, test acc 0.852, time 253.3 sec
epoch 3, loss 0.1042, train acc 0.866, test acc 0.856, time 253.7 sec
epoch 4, loss 0.0682, train acc 0.888, test acc 0.868, time 254.2 sec
epoch 5, loss 0.0483, train acc 0.901, test acc 0.862, time 251.4 sec
```{.python .input n=49}
# 本函数已保存在d2lzh包中方便以后使用
``` python
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def predict_sentiment(net, vocab, sentence):
sentence = nd.array(vocab.to_indices(sentence), ctx=d2l.try_gpu())
label = nd.argmax(net(sentence.reshape((1, -1))), axis=1)
return 'positive' if label.asscalar() == 1 else 'negative'
sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
label = torch.argmax(net(sentence.view((1, -1))), dim=1)
return 'positive' if label.item() == 1 else 'negative'
```{.python .input n=50}
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])
``` python
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great']) # positive
```{.python .input}
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])
``` python
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad']) # negative
## 小结
......@@ -201,23 +253,10 @@ predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])
* 可以应用预训练的词向量和循环神经网络对文本的情感进行分类。
## 练习
* 增加迭代周期。训练后的模型能在训练和测试数据集上得到怎样的准确率?再调节其他超参数试试?
* 使用更大的预训练词向量,如300维的GloVe词向量,能否提升分类准确率?
* 使用spaCy分词工具,能否提升分类准确率?你需要安装spaCy(`pip install spacy`),并且安装英文包(`python -m spacy download en`)。在代码中,先导入spacy(`import spacy`)。然后加载spacy英文包(`spacy_en = spacy.load('en')`)。最后定义函数`def tokenizer(text): return [tok.text for tok in spacy_en.tokenizer(text)]`并替换原来的基于空格分词的`tokenizer`函数。需要注意的是,GloVe词向量对于名词词组的存储方式是用“-”连接各个单词,例如,词组“new york”在GloVe词向量中的表示为“new-york”,而使用spaCy分词之后“new york”的存储可能是“new york”。
## 参考文献
[1] Maas, A. L., Daly, R. E., Pham, P. T., Huang, D., Ng, A. Y., & Potts, C. (2011, June). Learning word vectors for sentiment analysis. In Proceedings of the 49th annual meeting of the association for computational linguistics: Human language technologies-volume 1 (pp. 142-150). Association for Computational Linguistics.
