# This file is generated automatically through:
#    d2lbook build lib
# Don't edit it directly

# Defined in file: ./chapter_preface/index.md
import collections
import hashlib
import math
import os
import random
import re
import shutil
import sys
import tarfile
import time
import zipfile
from collections import defaultdict

import pandas as pd
import requests
from IPython import display
from matplotlib import pyplot as plt

d2l = sys.modules[__name__]


# Defined in file: ./chapter_preface/index.md
from mxnet import autograd, context, gluon, image, init, np, npx
from mxnet.gluon import nn, rnn


# Defined in file: ./chapter_preliminaries/pandas.md
def mkdir_if_not_exist(path):
    """如果目录不存在则创建"""
    if not isinstance(path, str):
        path = os.path.join(*path)
    if not os.path.exists(path):
        os.makedirs(path)


# Defined in file: ./chapter_preliminaries/calculus.md
def use_svg_display():
    """Use the svg format to display a plot in Jupyter."""
    display.set_matplotlib_formats('svg')


# Defined in file: ./chapter_preliminaries/calculus.md
def set_figsize(figsize=(3.5, 2.5)):
    """Set the figure size for matplotlib."""
    use_svg_display()
    d2l.plt.rcParams['figure.figsize'] = figsize


# Defined in file: ./chapter_preliminaries/calculus.md
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Set the axes for matplotlib."""
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()


# Defined in file: ./chapter_preliminaries/calculus.md
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot data points."""
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()

    # Return True if `X` (tensor or list) has 1 axis
    def has_one_axis(X):
        return (hasattr(X, "ndim") and X.ndim == 1 or
                isinstance(X, list) and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        X = X * len(Y)
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
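

# Editor's usage sketch (illustrative, not part of the generated library):
# plotting y = x^2 against x with `plot`. The variable names are assumptions.
def _example_plot():
    x = d2l.arange(0, 3, 0.1)
    plot(x, x**2, xlabel='x', ylabel='y', legend=['x^2'])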


# Defined in file: ./chapter_linear-networks/linear-regression.md
class Timer:
    """Record multiple running times."""
    def __init__(self):
        self.times = []
        self.start()

    def start(self):
        """Start the timer."""
        self.tik = time.time()

    def stop(self):
        """Stop the timer and record the time in a list."""
        self.times.append(time.time() - self.tik)
        return self.times[-1]

    def avg(self):
        """Return the average time."""
        return sum(self.times) / len(self.times)

    def sum(self):
        """Return the sum of times."""
        return sum(self.times)

    def cumsum(self):
        """Return the accumulated time."""
        return np.array(self.times).cumsum().tolist()
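

# Editor's usage sketch (illustrative, not part of the generated library):
# timing a small workload a few times with `Timer`.
def _example_timer():
    timer = Timer()
    for _ in range(3):
        timer.start()
        sum(range(10000))
        timer.stop()
    print(f'avg {timer.avg():.5f} sec, total {timer.sum():.5f} sec')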


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def synthetic_data(w, b, num_examples):
    """Generate y = Xw + b + noise."""
    X = d2l.normal(0, 1, (num_examples, len(w)))
    y = d2l.matmul(X, w) + b
    y += d2l.normal(0, 0.01, y.shape)
    return X, d2l.reshape(y, (-1, 1))


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def linreg(X, w, b):
    """The linear regression model."""
    return d2l.matmul(X, w) + b


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def squared_loss(y_hat, y):
    """Squared loss."""
    return (y_hat - d2l.reshape(y, y_hat.shape))**2 / 2


# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent."""
    for param in params:
        param[:] = param - lr * param.grad / batch_size
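

# Editor's usage sketch (illustrative, not part of the generated library):
# wiring `synthetic_data`, `linreg`, `squared_loss`, and `sgd` into one
# training epoch. The hyperparameters below are illustrative assumptions.
def _example_linreg_scratch():
    true_w, true_b = d2l.tensor([2, -3.4]), 4.2
    features, labels = synthetic_data(true_w, true_b, 1000)
    w = d2l.normal(0, 0.01, (2, 1))
    b = d2l.zeros(1)
    w.attach_grad()
    b.attach_grad()
    for X, y in load_array((features, labels), batch_size=10):
        with autograd.record():
            l = squared_loss(linreg(X, w, b), y)
        l.backward()
        sgd([w, b], lr=0.03, batch_size=10)
    print(float(squared_loss(linreg(features, w, b), labels).mean()))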


# Defined in file: ./chapter_linear-networks/linear-regression-concise.md
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a Gluon data iterator."""
    dataset = gluon.data.ArrayDataset(*data_arrays)
    return gluon.data.DataLoader(dataset, batch_size, shuffle=is_train)


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def get_fashion_mnist_labels(labels):
    """Return text labels for the Fashion-MNIST dataset."""
    text_labels = [
        't-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt',
        'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Plot a list of images."""
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
    axes = axes.flatten()
    for i, (ax, img) in enumerate(zip(axes, imgs)):
        ax.imshow(d2l.numpy(img))
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def get_dataloader_workers():
    """Use 4 processes to read the data, except on Windows."""
    return 0 if sys.platform.startswith('win') else 4


# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def load_data_fashion_mnist(batch_size, resize=None):
    """Download the Fashion-MNIST dataset and then load it into memory."""
    dataset = gluon.data.vision
    trans = [dataset.transforms.ToTensor()]
    if resize:
        trans.insert(0, dataset.transforms.Resize(resize))
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans)
    return (gluon.data.DataLoader(mnist_train, batch_size, shuffle=True,
                                  num_workers=get_dataloader_workers()),
            gluon.data.DataLoader(mnist_test, batch_size, shuffle=False,
                                  num_workers=get_dataloader_workers()))
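

# Editor's usage sketch (illustrative, not part of the generated library):
# reading one minibatch and mapping the first few labels back to text.
# Note that calling this downloads Fashion-MNIST if it is not cached.
def _example_load_data_fashion_mnist():
    train_iter, test_iter = load_data_fashion_mnist(batch_size=32)
    for X, y in train_iter:
        print(X.shape, get_fashion_mnist_labels(y[:3]))
        break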


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def accuracy(y_hat, y):
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = d2l.argmax(y_hat, axis=1)
    cmp = d2l.astype(y_hat, y.dtype) == y
    return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype)))


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def evaluate_accuracy(net, data_iter):
    """Compute the accuracy for a model on the given dataset."""
    metric = Accumulator(2)  # No. of correct predictions, no. of predictions
    for _, (X, y) in enumerate(data_iter):
        metric.add(accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
class Accumulator:
    """For accumulating sums over `n` variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
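

# Editor's usage sketch (illustrative, not part of the generated library):
# `Accumulator` keeps running sums, e.g. correct predictions and counts.
def _example_accumulator():
    metric = Accumulator(2)
    metric.add(3, 10)  # 3 correct out of 10
    metric.add(5, 10)  # 5 correct out of 10
    print(metric[0] / metric[1])  # 0.4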


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def train_epoch_ch3(net, train_iter, loss, updater):
    """Train a model within one epoch (defined in Chapter 3)."""
    # Sum of training loss, sum of training accuracy, no. of examples
    metric = Accumulator(3)
    if isinstance(updater, gluon.Trainer):
        updater = updater.step
    for X, y in train_iter:
        # Compute gradients and update parameters
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y)
        l.backward()
        updater(X.shape[0])
        metric.add(float(l.sum()), accuracy(y_hat, y), y.size)
    # Return training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
class Animator:
    """For plotting data in animation."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        # Incrementally plot multiple lines
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes,]
        # Use a lambda function to capture arguments
        self.config_axes = lambda: d2l.set_axes(self.axes[
            0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Add multiple data points into the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):
    """Train a model (defined in Chapter 3)."""
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc


# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def predict_ch3(net, test_iter, n=6):
    """Predict labels (defined in Chapter 3)."""
    for X, y in test_iter:
        break
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1))
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(d2l.reshape(X[0:n], (n, 28, 28)), 1, n,
                    titles=titles[0:n])


# Defined in file: ./chapter_multilayer-perceptrons/underfit-overfit.md
def evaluate_loss(net, data_iter, loss):
    """Evaluate the loss of a model on the given dataset."""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        l = loss(net(X), y)
        metric.add(d2l.reduce_sum(l), d2l.size(l))
    return metric[0] / metric[1]


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download(name, cache_dir=os.path.join('..', 'data')):
    """Download a file from DATA_HUB and return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    d2l.mkdir_if_not_exist(cache_dir)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download_extract(name, folder=None):
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted.'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir


def download_all():
    """Download all files in the DATA_HUB."""
    for name in DATA_HUB:
        download(name)


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB['kaggle_house_train'] = (DATA_URL + 'kaggle_house_pred_train.csv',
                                  '585e9cc93e70b39160e7921475f9bcd7d31219ce')

DATA_HUB['kaggle_house_test'] = (DATA_URL + 'kaggle_house_pred_test.csv',
                                 'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')


# Defined in file: ./chapter_deep-learning-computation/use-gpu.md
def try_gpu(i=0):
    """Return gpu(i) if it exists, otherwise return cpu()."""
    return npx.gpu(i) if npx.num_gpus() >= i + 1 else npx.cpu()


def try_all_gpus():
    """Return all available GPUs, or [cpu()] if no GPU exists."""
    devices = [npx.gpu(i) for i in range(npx.num_gpus())]
    return devices if devices else [npx.cpu()]
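

# Editor's usage sketch (illustrative, not part of the generated library):
# picking a device before allocating tensors on it.
def _example_try_gpu():
    device = try_gpu()  # gpu(0) if available, else cpu()
    x = d2l.ones((2, 3), ctx=device)
    print(x.ctx, try_all_gpus())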


# Defined in file: ./chapter_convolutional-neural-networks/conv-layer.md
def corr2d(X, K):
    """Compute 2D cross-correlation."""
    h, w = K.shape
    Y = d2l.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = d2l.reduce_sum((X[i:i + h, j:j + w] * K))
    return Y
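

# Editor's usage sketch (illustrative, not part of the generated library):
# sliding a 2x2 kernel over a 3x3 input yields a 2x2 output.
def _example_corr2d():
    X = d2l.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
    K = d2l.tensor([[0.0, 1.0], [2.0, 3.0]])
    print(corr2d(X, K))  # [[19. 25.], [37. 43.]]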


# Defined in file: ./chapter_convolutional-neural-networks/lenet.md
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy for a model on a dataset using a GPU."""
    if not device:  # Query the first device where the first parameter is on
        device = list(net.collect_params().values())[0].list_ctx()[0]
    # No. of correct predictions, no. of predictions
    metric = d2l.Accumulator(2)
    for X, y in data_iter:
        X, y = X.as_in_ctx(device), y.as_in_ctx(device)
        metric.add(d2l.accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]


# Defined in file: ./chapter_convolutional-neural-networks/lenet.md
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train a model with a GPU (defined in Chapter 6)."""
    net.initialize(force_reinit=True, ctx=device, init=init.Xavier())
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            # Here is the major difference from `d2l.train_epoch_ch3`
            X, y = X.as_in_ctx(device), y.as_in_ctx(device)
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            trainer.step(X.shape[0])
            metric.add(l.sum(), d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')


# Defined in file: ./chapter_convolutional-modern/resnet.md
class Residual(nn.Block):
    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super().__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1,
                               strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2D(num_channels, kernel_size=1,
                                   strides=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()

    def forward(self, X):
        Y = npx.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return npx.relu(Y + X)


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')


def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[0])
        self.token_freqs.sort(key=lambda x: x[1], reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        uniq_tokens += [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


def count_corpus(tokens):
    """Count token frequencies."""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
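

# Editor's usage sketch (illustrative, not part of the generated library):
# building a vocabulary from tokenized lines and mapping tokens to indices.
def _example_vocab():
    tokens = tokenize(['the time machine', 'the time traveller'])
    vocab = Vocab(tokens)
    print(vocab[['the', 'time', 'machine']])  # e.g. [1, 2, 3]
    print(vocab.to_tokens(vocab[['the']]))  # ['the']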


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def load_corpus_time_machine(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Since each text line in the time machine dataset is not necessarily a
    # sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Start with a random offset to partition a sequence
    corpus = corpus[random.randint(0, num_steps):]
    # Subtract 1 since we need to account for labels
    num_subseqs = (len(corpus) - 1) // num_steps
    # The starting indices for subsequences of length `num_steps`
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # In random sampling, the subsequences from two adjacent random
    # minibatches during iteration are not necessarily adjacent on the
    # original sequence
    random.shuffle(initial_indices)

    def data(pos):
        # Return a sequence of length `num_steps` starting from `pos`
        return corpus[pos:pos + num_steps]

    num_subseqs_per_example = num_subseqs // batch_size
    for i in range(0, batch_size * num_subseqs_per_example, batch_size):
        # Here, `initial_indices` contains randomized starting indices for
        # subsequences
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield d2l.tensor(X), d2l.tensor(Y)


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using sequential partitioning."""
    # Start with a random offset to partition a sequence
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = d2l.tensor(corpus[offset:offset + num_tokens])
    Ys = d2l.tensor(corpus[offset + 1:offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_batches * num_steps, num_steps):
        X = Xs[:, i:i + num_steps]
        Y = Ys[:, i:i + num_steps]
        yield X, Y
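

# Editor's usage sketch (illustrative, not part of the generated library):
# both iterators yield (X, Y) pairs where Y is X shifted by one token.
def _example_seq_data_iter():
    corpus = list(range(35))
    for X, Y in seq_data_iter_sequential(corpus, batch_size=2, num_steps=5):
        print(X, Y)
        break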


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
class SeqDataLoader:
    """An iterator to load sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def load_data_time_machine(batch_size, num_steps, use_random_iter=False,
                           max_tokens=10000):
    """Return the iterator and the vocabulary of the time machine dataset."""
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter,
                              max_tokens)
    return data_iter, data_iter.vocab


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
class RNNModelScratch:
    """An RNN Model implemented from scratch."""
    def __init__(self, vocab_size, num_hiddens, device, get_params,
                 init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = npx.one_hot(X.T, self.vocab_size)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, ctx):
        return self.init_state(batch_size, self.num_hiddens, ctx)


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def predict_ch8(prefix, num_preds, model, vocab, device):
    """Generate new characters following the `prefix`."""
    state = model.begin_state(batch_size=1, ctx=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: d2l.reshape(d2l.tensor([outputs[-1]], ctx=device),
                                    (1, 1))
    for y in prefix[1:]:  # Warm-up period
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = model(get_input(), state)
        outputs.append(int(y.argmax(axis=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def grad_clipping(model, theta):
    """Clip the gradient."""
    if isinstance(model, gluon.Block):
        params = [p.data() for p in model.collect_params().values()]
    else:
        params = model.params
    norm = math.sqrt(sum((p.grad**2).sum() for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def train_epoch_ch8(model, train_iter, loss, updater, device,
                    use_random_iter):
    """Train a model within one epoch (defined in Chapter 8)."""
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize `state` when either it is the first iteration or
            # using random sampling
            state = model.begin_state(batch_size=X.shape[0], ctx=device)
        else:
            for s in state:
                s.detach()
        y = Y.T.reshape(-1)
        X, y = X.as_in_ctx(device), y.as_in_ctx(device)
        with autograd.record():
            y_hat, state = model(X, state)
            l = loss(y_hat, y).mean()
        l.backward()
        grad_clipping(model, 1)
        updater(batch_size=1)  # Since the `mean` function has been invoked
        metric.add(l * d2l.size(y), d2l.size(y))
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def train_ch8(model, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    """Train a model (defined in Chapter 8)."""
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',
                            legend=['train'], xlim=[10, num_epochs])
    # Initialize
    if isinstance(model, gluon.Block):
        model.initialize(ctx=device, force_reinit=True,
                         init=init.Normal(0.01))
        trainer = gluon.Trainer(model.collect_params(), 'sgd',
                                {'learning_rate': lr})
        updater = lambda batch_size: trainer.step(batch_size)
    else:
        updater = lambda batch_size: d2l.sgd(model.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, model, vocab, device)
    # Train and predict
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(model, train_iter, loss, updater, device,
                                     use_random_iter)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))


# Defined in file: ./chapter_recurrent-neural-networks/rnn-concise.md
class RNNModel(nn.Block):
    """The RNN model."""
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.dense = nn.Dense(vocab_size)

    def forward(self, inputs, state):
        X = npx.one_hot(inputs.T, self.vocab_size)
        Y, state = self.rnn(X, state)
        # The fully-connected layer will first change the shape of `Y` to
        # (`num_steps` * `batch_size`, `num_hiddens`). Its output shape is
        # (`num_steps` * `batch_size`, `vocab_size`).
        output = self.dense(Y.reshape(-1, Y.shape[-1]))
        return output, state

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')


def read_data_nmt():
    """Load the English-French dataset."""
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r') as f:
        return f.read()


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def preprocess_nmt(text):
    """Preprocess the English-French dataset."""
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    # Replace non-breaking space with space, and convert uppercase letters to
    # lowercase ones
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # Insert space between words and punctuation marks
    out = [
        ' ' + char if i > 0 and no_space(char, text[i - 1]) else char
        for i, char in enumerate(text)]
    return ''.join(out)


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def tokenize_nmt(text, num_examples=None):
    """Tokenize the English-French dataset."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad sequences."""
    if len(line) > num_steps:
        return line[:num_steps]  # Truncate
    return line + [padding_token] * (num_steps - len(line))  # Pad
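

# Editor's usage sketch (illustrative, not part of the generated library):
# padding short lines and truncating long lines to a fixed `num_steps`.
def _example_truncate_pad():
    print(truncate_pad([1, 2, 3], 5, 0))  # [1, 2, 3, 0, 0]
    print(truncate_pad([1, 2, 3, 4, 5, 6], 5, 0))  # [1, 2, 3, 4, 5]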


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def build_array_nmt(lines, vocab, num_steps):
    """Transform text sequences of machine translation into minibatches."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = d2l.tensor([
        truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = d2l.reduce_sum(d2l.astype(array != vocab['<pad>'], d2l.int32),
                               1)
    return array, valid_len


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Return the iterator and the vocabularies of the translation dataset."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class Encoder(nn.Block):
    """The base encoder interface for the encoder-decoder architecture."""
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class Decoder(nn.Block):
    """The base decoder interface for the encoder-decoder architecture."""
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class EncoderDecoder(nn.Block):
    """The base class for the encoder-decoder architecture."""
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
class Seq2SeqEncoder(d2l.Encoder):
    """The RNN encoder for sequence to sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=dropout)

    def forward(self, X, *args):
        # The output `X` shape: (`batch_size`, `num_steps`, `embed_size`)
        X = self.embedding(X)
        # In RNN models, the first axis corresponds to time steps
        X = X.swapaxes(0, 1)
        state = self.rnn.begin_state(batch_size=X.shape[1], ctx=X.ctx)
        output, state = self.rnn(X, state)
        # `output` shape: (`num_steps`, `batch_size`, `num_hiddens`)
        # `state[0]` shape: (`num_layers`, `batch_size`, `num_hiddens`)
        return output, state


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
class MaskedSoftmaxCELoss(gluon.loss.SoftmaxCELoss):
    """The softmax cross-entropy loss with masks."""

    # `pred` shape: (`batch_size`, `num_steps`, `vocab_size`)
    # `label` shape: (`batch_size`, `num_steps`)
    # `valid_len` shape: (`batch_size`,)
    def forward(self, pred, label, valid_len):
        # `weights` shape: (`batch_size`, `num_steps`, 1)
        weights = np.expand_dims(np.ones_like(label), axis=-1)
        weights = npx.sequence_mask(weights, valid_len, True, axis=1)
        return super(MaskedSoftmaxCELoss, self).forward(pred, label, weights)


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence."""
    net.initialize(init.Xavier(), force_reinit=True, ctx=device)
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            X, X_valid_len, Y, Y_valid_len = [
                x.as_in_ctx(device) for x in batch]
            bos = np.array([tgt_vocab['<bos>']] * Y.shape[0],
                           ctx=device).reshape(-1, 1)
            dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
            with autograd.record():
                Y_hat, _ = net(X, dec_input, X_valid_len)
                l = loss(Y_hat, Y, Y_valid_len)
            l.backward()
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            trainer.step(num_tokens)
            metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device, save_attention_weights=False):
    """Predict for sequence to sequence."""
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [
        src_vocab['<eos>']]
    enc_valid_len = np.array([len(src_tokens)], ctx=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    # Add the batch axis
    enc_X = np.expand_dims(np.array(src_tokens, ctx=device), axis=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_len)
    # Add the batch axis
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device), axis=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # We use the token with the highest prediction likelihood as the input
        # of the decoder at the next time step
        dec_X = Y.argmax(axis=2)
        pred = dec_X.squeeze(axis=0).astype('int32').item()
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weight_seq.append(net.decoder.attention_weights)
        # Once the end-of-sequence token is predicted, the generation of the
        # output sequence is complete
        if pred == tgt_vocab['<eos>']:
            break
        output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
def bleu(pred_seq, label_seq, k):
    """Compute the BLEU."""
    pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ')
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, k + 1):
        num_matches, label_subs = 0, collections.defaultdict(int)
        for i in range(len_label - n + 1):
            label_subs[''.join(label_tokens[i:i + n])] += 1
        for i in range(len_pred - n + 1):
            if label_subs[''.join(pred_tokens[i:i + n])] > 0:
                num_matches += 1
                label_subs[''.join(pred_tokens[i:i + n])] -= 1
        score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n))
    return score
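

# Editor's usage sketch (illustrative, not part of the generated library):
# BLEU is 1.0 for a perfect match and decays with n-gram mismatches.
def _example_bleu():
    print(bleu('he is calm', 'he is calm', k=2))  # 1.0
    print(bleu('he is so calm', 'he is calm', k=2))  # < 1.0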


# Defined in file: ./chapter_attention-mechanisms/attention-cues.md
def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5),
                  cmap='Reds'):
    d2l.use_svg_display()
    num_rows, num_cols = matrices.shape[0], matrices.shape[1]
    fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize,
                                 sharex=True, sharey=True, squeeze=False)
    for i, (row_axes, row_matrices) in enumerate(zip(axes, matrices)):
        for j, (ax, matrix) in enumerate(zip(row_axes, row_matrices)):
            pcm = ax.imshow(d2l.numpy(matrix), cmap=cmap)
            if i == num_rows - 1:
                ax.set_xlabel(xlabel)
            if j == 0:
                ax.set_ylabel(ylabel)
            if titles:
                ax.set_title(titles[j])
    fig.colorbar(pcm, ax=axes, shrink=0.6)


# Defined in file: ./chapter_attention-mechanisms/attention-scoring-functions.md
def masked_softmax(X, valid_lens):
    """Perform softmax operation by masking elements on the last axis."""
    # `X`: 3D tensor, `valid_lens`: 1D or 2D tensor
    if valid_lens is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_lens.ndim == 1:
            valid_lens = valid_lens.repeat(shape[1])
        else:
            valid_lens = valid_lens.reshape(-1)
        # On the last axis, replace masked elements with a very large negative
        # value, whose exponentiation outputs 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_lens, True,
                              value=-1e6, axis=1)
        return npx.softmax(X).reshape(shape)
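

# Editor's usage sketch (illustrative, not part of the generated library):
# entries beyond each sequence's valid length get (near-)zero weight.
def _example_masked_softmax():
    X = d2l.normal(0, 1, (2, 2, 4))
    print(masked_softmax(X, d2l.tensor([2, 3])))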


# Defined in file: ./chapter_attention-mechanisms/attention-scoring-functions.md
class AdditiveAttention(nn.Block):
    """Additive attention."""
    def __init__(self, num_hiddens, dropout, **kwargs):
        super(AdditiveAttention, self).__init__(**kwargs)
        # Use `flatten=False` to only transform the last axis so that the
        # shapes for the other axes are kept the same
        self.W_k = nn.Dense(num_hiddens, use_bias=False, flatten=False)
        self.W_q = nn.Dense(num_hiddens, use_bias=False, flatten=False)
        self.w_v = nn.Dense(1, use_bias=False, flatten=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        queries, keys = self.W_q(queries), self.W_k(keys)
        # After dimension expansion, shape of `queries`: (`batch_size`, no. of
        # queries, 1, `num_hiddens`) and shape of `keys`: (`batch_size`, 1,
        # no. of key-value pairs, `num_hiddens`). Sum them up with
        # broadcasting
        features = np.expand_dims(queries, axis=2) + np.expand_dims(
            keys, axis=1)
        features = np.tanh(features)
        # There is only one output of `self.w_v`, so we remove the last
        # one-dimensional entry from the shape. Shape of `scores`:
        # (`batch_size`, no. of queries, no. of key-value pairs)
        scores = np.squeeze(self.w_v(features), axis=-1)
        self.attention_weights = masked_softmax(scores, valid_lens)
        # Shape of `values`: (`batch_size`, no. of key-value pairs, value
        # dimension)
        return npx.batch_dot(self.dropout(self.attention_weights), values)


# Defined in file: ./chapter_attention-mechanisms/attention-scoring-functions.md
class DotProductAttention(nn.Block):
    """Scaled dot product attention."""
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # Shape of `queries`: (`batch_size`, no. of queries, `d`)
    # Shape of `keys`: (`batch_size`, no. of key-value pairs, `d`)
    # Shape of `values`: (`batch_size`, no. of key-value pairs, value
    # dimension)
    # Shape of `valid_lens`: (`batch_size`,) or (`batch_size`, no. of queries)
    def forward(self, queries, keys, values, valid_lens=None):
        d = queries.shape[-1]
        # Set `transpose_b=True` to swap the last two dimensions of `keys`
        scores = npx.batch_dot(queries, keys, transpose_b=True) / math.sqrt(d)
        self.attention_weights = masked_softmax(scores, valid_lens)
        return npx.batch_dot(self.dropout(self.attention_weights), values)


# Defined in file: ./chapter_attention-mechanisms/bahdanau-attention.md
class AttentionDecoder(d2l.Decoder):
    """The base attention-based decoder interface."""
    def __init__(self, **kwargs):
        super(AttentionDecoder, self).__init__(**kwargs)

    @property
    def attention_weights(self):
        raise NotImplementedError


# Defined in file: ./chapter_attention-mechanisms/multihead-attention.md
class MultiHeadAttention(nn.Block):
    def __init__(self, num_hiddens, num_heads, dropout, use_bias=False,
                 **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_k = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_v = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_o = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)

    def forward(self, queries, keys, values, valid_lens):
        # Shape of `queries`, `keys`, or `values`:
        # (`batch_size`, no. of queries or key-value pairs, `num_hiddens`)
        # Shape of `valid_lens`:
        # (`batch_size`,) or (`batch_size`, no. of queries)
        # After transposing, shape of output `queries`, `keys`, or `values`:
        # (`batch_size` * `num_heads`, no. of queries or key-value pairs,
        # `num_hiddens` / `num_heads`)
        queries = transpose_qkv(self.W_q(queries), self.num_heads)
        keys = transpose_qkv(self.W_k(keys), self.num_heads)
        values = transpose_qkv(self.W_v(values), self.num_heads)

        if valid_lens is not None:
            # On axis 0, copy the first item (scalar or vector) for
            # `num_heads` times, then copy the next item, and so on
            valid_lens = valid_lens.repeat(self.num_heads, axis=0)

        # Shape of `output`: (`batch_size` * `num_heads`, no. of queries,
        # `num_hiddens` / `num_heads`)
        output = self.attention(queries, keys, values, valid_lens)

        # Shape of `output_concat`:
        # (`batch_size`, no. of queries, `num_hiddens`)
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)


# Defined in file: ./chapter_attention-mechanisms/multihead-attention.md
def transpose_qkv(X, num_heads):
    # Shape of input `X`:
    # (`batch_size`, no. of queries or key-value pairs, `num_hiddens`).
    # Shape of output `X`:
    # (`batch_size`, no. of queries or key-value pairs, `num_heads`,
    # `num_hiddens` / `num_heads`)
    X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)

    # Shape of output `X`:
    # (`batch_size`, `num_heads`, no. of queries or key-value pairs,
    # `num_hiddens` / `num_heads`)
    X = X.transpose(0, 2, 1, 3)

    # Shape of `output`:
    # (`batch_size` * `num_heads`, no. of queries or key-value pairs,
    # `num_hiddens` / `num_heads`)
    return X.reshape(-1, X.shape[2], X.shape[3])


def transpose_output(X, num_heads):
    """Reverse the operation of `transpose_qkv`"""
    X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
    X = X.transpose(0, 2, 1, 3)
    return X.reshape(X.shape[0], X.shape[1], -1)
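

# Editor's usage sketch (illustrative, not part of the generated library):
# `transpose_qkv` and `transpose_output` are inverses for multi-head shapes.
def _example_transpose_qkv():
    X = d2l.ones((2, 4, 100))  # (batch, queries, num_hiddens)
    Y = transpose_qkv(X, num_heads=5)
    print(Y.shape)  # (10, 4, 20)
    print(transpose_output(Y, num_heads=5).shape)  # (2, 4, 100)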


# Defined in file: ./chapter_attention-mechanisms/self-attention-and-positional-encoding.md
class PositionalEncoding(nn.Block):
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Create a long enough `P`
        self.P = d2l.zeros((1, max_len, num_hiddens))
        X = d2l.arange(max_len).reshape(-1, 1) / np.power(
            10000,
            np.arange(0, num_hiddens, 2) / num_hiddens)
        self.P[:, :, 0::2] = np.sin(X)
        self.P[:, :, 1::2] = np.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].as_in_ctx(X.ctx)
        return self.dropout(X)


# Defined in file: ./chapter_attention-mechanisms/transformer.md
class PositionWiseFFN(nn.Block):
    def __init__(self, ffn_num_hiddens, ffn_num_outputs, **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Dense(ffn_num_hiddens, flatten=False,
                               activation='relu')
        self.dense2 = nn.Dense(ffn_num_outputs, flatten=False)

    def forward(self, X):
        return self.dense2(self.dense1(X))


# Defined in file: ./chapter_attention-mechanisms/transformer.md
class AddNorm(nn.Block):
    def __init__(self, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm()

    def forward(self, X, Y):
        return self.ln(self.dropout(Y) + X)


# Defined in file: ./chapter_attention-mechanisms/transformer.md
class EncoderBlock(nn.Block):
    def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, dropout,
                 use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = d2l.MultiHeadAttention(num_hiddens, num_heads,
                                                dropout, use_bias)
        self.addnorm1 = AddNorm(dropout)
        self.ffn = PositionWiseFFN(ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(dropout)

    def forward(self, X, valid_lens):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))
        return self.addnorm2(Y, self.ffn(Y))


# Defined in file: ./chapter_attention-mechanisms/transformer.md
class TransformerEncoder(d2l.Encoder):
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads,
                 num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for _ in range(num_layers):
            self.blks.add(
                EncoderBlock(num_hiddens, ffn_num_hiddens, num_heads, dropout,
                             use_bias))

    def forward(self, X, valid_lens, *args):
        # Since positional encoding values are between -1 and 1, the embedding
        # values are multiplied by the square root of the embedding dimension
        # to rescale before they are summed up
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[
                i] = blk.attention.attention.attention_weights
        return X


# Alias defined in config.ini
size = lambda a: a.size
transpose = lambda a: a.T

ones = np.ones
zeros = np.zeros
arange = np.arange
meshgrid = np.meshgrid
sin = np.sin
sinh = np.sinh
cos = np.cos
cosh = np.cosh
tanh = np.tanh
linspace = np.linspace
exp = np.exp
log = np.log
tensor = np.array
normal = np.random.normal
rand = np.random.rand
matmul = np.dot
int32 = np.int32
float32 = np.float32
concat = np.concatenate
stack = np.stack
abs = np.abs
eye = np.eye
numpy = lambda x, *args, **kwargs: x.asnumpy(*args, **kwargs)
reshape = lambda x, *args, **kwargs: x.reshape(*args, **kwargs)
to = lambda x, *args, **kwargs: x.as_in_context(*args, **kwargs)
reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs)
astype = lambda x, *args, **kwargs: x.astype(*args, **kwargs)