Commit 83bf14fd authored by xiaotinghe, committed by Aston Zhang

update ptb (#290)

* Add files via upload

* Add files via upload

* Add files via upload

* Add files via upload

* Add files via upload

* Add files via upload

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis-cnn.md

* Add files via upload

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis-cnn.md

* Update sentiment-analysis.md

* Update sentiment-analysis-cnn.md

* Update PTB

* update ptb
Parent e9256ce0
......@@ -17,8 +17,8 @@ import collections
import gluonbook as gb
import mxnet as mx
from mxnet import autograd, gluon, init, metric, nd
from mxnet.gluon import loss as gloss, nn, rnn
from mxnet.contrib import text
from mxnet.gluon import loss as gloss, nn, rnn
import os
import random
import zipfile
......@@ -44,7 +44,7 @@ if demo:
Next, we read the training and test data sets.
```{.python .input}
```{.python .input n=3}
def readIMDB(dir_url, seg='train'):
pos_or_neg = ['pos', 'neg']
data = []
......@@ -90,7 +90,7 @@ for review, score in test_data:
Now we can create the vocabulary from the tokenized training data set. Here we set the special token "<unk>" (unknown), which represents every word that does not appear in the vocabulary built from the training data set.
```{.python .input n=6}
```{.python .input n=5}
token_counter = collections.Counter()
def count_token(train_tokenized):
for sample in train_tokenized:
......@@ -109,7 +109,7 @@ vocab = text.vocab.Vocabulary(token_counter, unknown_token='<unk>',
Next, we continue preprocessing the data. Each variable-length review will be padded with the special token `PAD` into a sequence of length `maxlen` and represented as an NDArray.
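A minimal sketch of such a padding helper is shown below (the function name `pad_samples` and the padding index `0` are illustrative assumptions, not necessarily the exact code in this file):

```{.python .input}
# Sketch: pad or truncate each encoded review to a fixed length maxlen.
# The padding index 0 is an assumption for illustration.
def pad_samples(features, maxlen=500, PAD=0):
    padded_features = []
    for feature in features:
        if len(feature) > maxlen:
            # Truncate reviews that are longer than maxlen.
            padded_feature = feature[:maxlen]
        else:
            # Pad shorter reviews with the PAD index up to maxlen.
            padded_feature = feature + [PAD] * (maxlen - len(feature))
        padded_features.append(padded_feature)
    return padded_features
```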
```{.python .input n=7}
```{.python .input n=6}
def encode_samples(tokenized_samples, vocab):
features = []
for sample in tokenized_samples:
......
......@@ -12,12 +12,15 @@
```{.python .input n=1}
import sys
sys.path.append('..')
import collections
import gluonbook as gb
import math
import mxnet as mx
from mxnet import autograd, gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import loss as gloss, nn, rnn, utils as gutils
import numpy as np
import os
import time
import zipfile
......@@ -27,61 +30,81 @@ with zipfile.ZipFile('../data/ptb.zip', 'r') as zin:
## Building the Word Index
The `Dictionary` class defined below maps between words and integer indices.
Next, we read the data set.
```{.python .input n=2}
class Dictionary(object):
def __init__(self):
self.word_to_idx = {}
self.idx_to_word = []
```{.python .input}
def readPTB(dir_url, seg='train'):
data = []
with open(os.path.join('../data', dir_url, 'ptb.' + seg + '.txt'), 'r', encoding='utf8') as rf:
for line in rf:
data.append(line)
return data
train_data = readPTB('ptb', 'train')
valid_data = readPTB('ptb', 'valid')
test_data = readPTB('ptb', 'test')
```
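As a quick sanity check (a sketch that assumes the files were read from the path above), we can look at how many lines were loaded and what one raw line looks like:

```{.python .input}
# Each element of train_data is one raw line (one sentence) of the PTB text file.
print(len(train_data), len(valid_data), len(test_data))
print(train_data[0][:80])
```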
## Tokenization
def add_word(self, word):
if word not in self.word_to_idx:
self.idx_to_word.append(word)
self.word_to_idx[word] = len(self.idx_to_word) - 1
return self.word_to_idx[word]
Next, we tokenize each line of the data sets.
def __len__(self):
return len(self.idx_to_word)
```{.python .input}
def tokenizer(text):
return [tok.lower() for tok in text.split()] + ['<eos>']
train_tokenized = []
for content in train_data:
train_tokenized.append(tokenizer(content))
valid_tokenized = []
for content in valid_data:
valid_tokenized.append(tokenizer(content))
test_tokenized = []
for content in test_data:
test_tokenized.append(tokenizer(content))
```
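For instance, the tokenizer lower-cases the text, splits on whitespace, and appends an end-of-sentence marker; a toy string (not from the data set) illustrates this:

```{.python .input}
# Toy example of the tokenizer's output.
print(tokenizer('The Market Crashed'))
# ['the', 'market', 'crashed', '<eos>']
```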
The `Corpus` class below builds a dictionary mapping between words and indices from the text data sets it reads, and converts the text into sequences of word indices. In this way, each text data set becomes an integer sequence in NDArray format.
```{.python .input n=3}
class Corpus(object):
def __init__(self, path):
self.dictionary = Dictionary()
self.train = self.tokenize(path + 'train.txt')
self.valid = self.tokenize(path + 'valid.txt')
self.test = self.tokenize(path + 'test.txt')
def tokenize(self, path):
        # Add the words to the dictionary.
with open(path, 'r') as f:
num_words = 0
for line in f:
words = line.split() + ['<eos>']
num_words += len(words)
for word in words:
self.dictionary.add_word(word)
        # Convert the text into a sequence of word indices (NDArray format).
with open(path, 'r') as f:
indices = np.zeros((num_words,), dtype='int32')
idx = 0
for line in f:
words = line.split() + ['<eos>']
for word in words:
indices[idx] = self.dictionary.word_to_idx[word]
idx += 1
return nd.array(indices, dtype='int32')
## Creating the Vocabulary
Now, we create the vocabulary from the tokenized training data set. The special token "<unk>" (unknown) represents every word that does not appear in the vocabulary built from the training data set.
```{.python .input}
token_counter = collections.Counter()
def count_token(tokenized):
    # Count token frequencies over the given tokenized data set
    # (the original body iterated over the global train_tokenized by mistake).
    for sample in tokenized:
        for token in sample:
            if token not in token_counter:
                token_counter[token] = 1
            else:
                token_counter[token] += 1
count_token(train_tokenized)
vocab = text.vocab.Vocabulary(token_counter, unknown_token='<unk>',
reserved_tokens=None)
```
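To confirm the unknown-token behavior described above (a sketch; the exact index of a known word depends on its frequency in the training data):

```{.python .input}
# A token never seen in training maps to the index reserved for '<unk>'.
print(vocab.to_indices(['the', 'token-not-in-training-xyz']))
print(vocab.token_to_idx['<unk>'])
```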
## Building the Word Index
We convert the text into sequences of word indices. In this way, each text data set becomes an integer sequence in NDArray format.
```{.python .input}
def encode_samples(tokenized_samples, vocab):
feature = []
for sample in tokenized_samples:
for token in sample:
feature.append(vocab.token_to_idx[token])
return feature
ctx = gb.try_gpu()
train_features = nd.array(encode_samples(train_tokenized, vocab),
                          dtype='int32', ctx=ctx)
valid_features = nd.array(encode_samples(valid_tokenized, vocab),
                          dtype='int32', ctx=ctx)
test_features = nd.array(encode_samples(test_tokenized, vocab),
                         dtype='int32', ctx=ctx)
```
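As a quick check (a sketch using the objects defined above), the flat index sequence can be mapped back to tokens:

```{.python .input}
# The corpus is now one flat int32 index sequence; map a few indices back to words.
print(train_features.shape)
print(vocab.to_tokens(train_features[:5].asnumpy().tolist()))
```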
Let us take a look at the size of the vocabulary.
```{.python .input n=4}
data = '../data/ptb/ptb.'
corpus = Corpus(data)
vocab_size = len(corpus.dictionary)
vocab_size = len(vocab)
vocab_size
```
......@@ -114,8 +137,8 @@ class RNNModel(nn.Block):
self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=drop_prob,
input_size=embed_size)
else:
raise ValueError('Invalid mode %s. Options are rnn_relu, '
'rnn_tanh, lstm, and gru' % mode)
raise ValueError("Invalid mode %s. Options are rnn_relu, "
"rnn_tanh, lstm, and gru" % mode)
self.dense = nn.Dense(vocab_size, in_units=num_hiddens)
self.num_hiddens = num_hiddens
......@@ -140,14 +163,13 @@ embed_size = 100
num_hiddens = 100
num_layers = 2
lr = 0.5
clipping_theta = 0.2
clipping_theta = 0.4
num_epochs = 2
batch_size = 32
num_steps = 5
drop_prob = 0.2
eval_period = 1000
ctx = gb.try_gpu()
model = RNNModel(model_name, vocab_size, embed_size, num_hiddens, num_layers,
drop_prob)
model.initialize(init.Xavier(), ctx=ctx)
......@@ -163,18 +185,18 @@ loss = gloss.SoftmaxCrossEntropyLoss()
```{.python .input n=7}
def batchify(data, batch_size):
num_batches = data.shape[0] // batch_size
data = data[: num_batches * batch_size]
data = data[:num_batches*batch_size]
data = data.reshape((batch_size, num_batches)).T
return data
train_data = batchify(corpus.train, batch_size).as_in_context(ctx)
val_data = batchify(corpus.valid, batch_size).as_in_context(ctx)
test_data = batchify(corpus.test, batch_size).as_in_context(ctx)
train_data = batchify(train_features, batch_size).as_in_context(ctx)
val_data = batchify(valid_features, batch_size).as_in_context(ctx)
test_data = batchify(test_features, batch_size).as_in_context(ctx)
def get_batch(source, i):
seq_len = min(num_steps, source.shape[0] - 1 - i)
X = source[i : i + seq_len]
Y = source[i + 1 : i + 1 + seq_len]
seq_len = min(num_steps, source.shape[0]-1-i)
X = source[i : i+seq_len]
Y = source[i+1 : i+1+seq_len]
return X, Y.reshape((-1,))
```
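To see the time-major layout these helpers produce, here is a small toy check (it reuses `batchify`, `get_batch`, and `num_steps` defined above; the numbers are arbitrary):

```{.python .input}
# Toy data: 20 consecutive integers with batch_size=4 give a (5, 4) time-major matrix.
toy = nd.arange(20)
toy_batched = batchify(toy, 4)
X, Y = get_batch(toy_batched, 0)
# Y is X shifted forward by one time step and flattened to match the model output.
print(toy_batched.shape, X.shape, Y.shape)
```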
......