Merge pull request #1 from d2l-ai/master

Update

Merge pull request #1 from d2l-ai/master
Update
43abdc37 · PEGASUS · GitHub · 8a2532cc · bd4917d8 · 43abdc37
46 changed files
--- a/.gitmodules
+++ b/.gitmodules
 [submodule "build/mx-theme"]
 	path = build/mx-theme
 	url = https://github.com/mli/mx-theme
+[submodule "build/utils"]
+	path = build/utils
+	url = https://github.com/d2l-ai/utils
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2,14 +2,15 @@ stage("Build and Publish") {
  node {
    ws('workspace/d2l-zh') {
 	  checkout scm
-      sh "build/build_all.sh"
-      sh """#!/bin/bash
-      set -e
-      if [[ ${env.BRANCH_NAME} == master ]]; then
-          build/upload.sh
-      fi
-      """
-	}
+      sh "git submodule update --init --recursive"
+      sh "build/utils/clean_build.sh"
+      sh "conda env update -f build/env.yml"
+      sh "build/utils/build_html.sh zh"
+      sh "build/utils/build_pdf.sh zh"
+      sh "build/utils/build_pkg.sh zh"
+      if (env.BRANCH_NAME == 'master') {
+        sh "build/utils/publish_website.sh zh"
+      }
+    }
  }
 }
-
--- a/Makefile
+++ b/Makefile
 all: html

-build/%.ipynb: %.md build/env.yml build/md2ipynb.py $(wildcard gluonbook/*)
+# Build build/<path>.ipynb from <path>.md with the md2ipynb.py converter that
+# lives in the build/utils submodule. The recipe runs from the target's own
+# directory, which is why $< and $@ are re-rooted with ../../ and the converter
+# is addressed as ../utils/md2ipynb.py. `cd` and the converter are chained with
+# && so that nothing is executed from the wrong directory if the cd fails.
+build/%.ipynb: %.md build/env.yml $(wildcard gluonbook/*)
 	@mkdir -p $(@D)
-	cd $(@D); python ../md2ipynb.py ../../$< ../../$@
+	cd $(@D) && python ../utils/md2ipynb.py ../../$< ../../$@

 build/%.md: %.md
 	@mkdir -p $(@D)
@@ -62,10 +62,9 @@ pdf: $(DEPS) $(OBJ) $(PDFIMG)
 	sed -i /\\\\sphinxtablecontinued{Continued\ on\ next\ page}/d $(TEX)
 	sed -i /{\\\\tablename\\\\\ \\\\thetable{}\ --\ continued\ from\ previous\ page}/d $(TEX)
 	cd build/_build/latex && \
-	bash ../../convert_output_svg.sh && \
+	bash ../../utils/convert_output_svg.sh && \
 	buf_size=10000000 xelatex d2l-zh.tex && \
 	buf_size=10000000 xelatex d2l-zh.tex

 clean:
 	rm -rf build/chapter* build/_build build/img build/data build/environment.yml build/README.md $(PKG)
-
--- a/build/_static/frontpage.css
+++ b/build/_static/frontpage.css
@@ -15,6 +15,22 @@ h2, h3, h4 {
    padding: 150px 0 0 0;
 }

+.endorsement h3 {
+    padding-top: 1.5em;
+    font-size: 1.4em;
+}
+
+.endorsement p {
+    line-height: 2em;
+    /* text-align: right; */
+}
+
+.endorsement .author {
+    font-weight: 500;
+    font-size: 1.4em;
+    margin: 0;
+}
+
 h2.toc {
    padding: 150px 0 0 0;
 }

--- a/build/build_all.sh
+++ b/build/build_all.sh
-#!/bin/bash
-set -ex
-
-tik=$(date +%s)
-
-[ -e build/data-bak ] && rm -rf build/data-bak
-
-# Clean build/chapter*/*ipynb and build/chapter*/*md that are no longer needed.
-cd build
-for ch in chapter*; do
-    if ! [ -e "../$ch" ]; then
-        rm -rf $ch
-    else
-        shopt -s nullglob
-        for f in $ch/*.md $ch/*.ipynb; do
-            base=$(basename $f)
-            md=${base%%.*}.md
-            if ! [ -e "../$ch/$md" ]; then
-                rm $f
-            fi
-        done
-    fi
-done
-# Clean images that are no longer needed.
-shopt -s nullglob
-for f in img/*.svg img/*.jpg img/*.png; do
-    if ! [ -e "../$f" ]; then
-        rm $f
-    fi
-done
-cd ..
-
-
-git submodule update --init
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-
-conda env update -f build/env.yml
-conda activate d2l-zh-build
-
-pip list
-rm -rf build/_build/
-
-make html
-
-make pdf
-cp build/_build/latex/d2l-zh.pdf build/_build/html/
-
-[ -e build/_build/latex/d2l-zh.aux ] && rm build/_build/latex/d2l-zh.aux
-[ -e build/_build/latex/d2l-zh.idx ] && rm build/_build/latex/d2l-zh.idx
-
-# avoid putting data downloaded by scripts into the notebook package
-mv build/data build/data-bak
-make pkg
-# backup build/data to avoid download the dataset each time and put the
-rm -rf build/data
-mv build/data-bak build/data
-
-# For 1.0
-cp build/_build/html/d2l-zh.zip build/_build/html/d2l-zh-1.0.zip
-
-# Time it
-tok=$(date +%s)
-runtime=$((tok-tik))
-convertsecs() {
-	((h=${1}/3600))
-	((m=(${1}%3600)/60))
-	((s=${1}%60))
-	printf "%02d:%02d:%02d\n" $h $m $s
-}
-echo $(convertsecs $runtime)
--- a/build/convert_output_svg.sh
+++ b/build/convert_output_svg.sh
-set -x
-set -e
-for f in *.svg; do
-    rsvg-convert -f pdf -z 0.80 -o ${f%.svg}.pdf $f
-done
--- a/build/env.yml
+++ b/build/env.yml
@@ -11,6 +11,6 @@ dependencies:
  - recommonmark==0.4.0
  - https://github.com/mli/notedown/tarball/master
  - https://s3-us-west-2.amazonaws.com/szha-experiments/mxnet_cu92-1.4.0b20181219-py2.py3-none-manylinux1_x86_64.whl
-  - gluonbook==0.8.8
+  - gluonbook==0.8.10
  - jieba==0.39
  - awscli
--- a/build/frontpage.html
+++ b/build/frontpage.html
@@ -12,6 +12,8 @@
  </div>
 </div>

+
+
 <div class = "authors mdl-grid">
  <div class = "mdl-cell mdl-cell--3-col mdl-cell--top">
    <div class="author-item">
@@ -45,6 +47,58 @@
  <h4> <a href="https://discuss.gluon.ai/t/topic/7571">成为贡献者，你将会获得专享版赠书并被致谢</a> </h4>
 </div>

+<div class="endorsement mdl-grid">
+
+  <div class = "mdl-cell mdl-cell--6-col mdl-cell--top">
+    <h2>学术界推荐</h2>
+  <div class="endorse">
+    <h3>“Dive into this book if you want to dive into deep learning!”</h3>
+    <p class="author">韩家炜</p>
+    <p class="title">ACM 院士、IEEE 院士<br>美国伊利诺伊大学香槟分校计算机系 Abel Bliss 教授
+    </p>
+  </div>
+
+  <div class="endorse">
+    <h3>“This is a highly welcome addition to the machine learning literature.”</h3>
+    <p class="author">Bernhard Schölkopf</p>
+    <p class="title">ACM 院士、德国国家科学院院士<br>德国马克斯·普朗克研究所智能系统院院长</p>
+  </div>
+
+  <div class="endorse">
+    <h3>“书中代码可谓‘所学即所用’。”</h3>
+    <p class="author">周志华</p>
+    <p class="title">ACM 院士、IEEE 院士、AAAS 院士<br>南京大学计算机科学与技术系主任</p>
+  </div>
+  </div>
+
+  <div class = "mdl-cell mdl-cell--6-col mdl-cell--top">
+    <h2>工业界推荐</h2>
+  <div class="endorse">
+    <h3>“《动手学深度学习》是最适合工业界研发工程师学习的。我毫无保留地向广大的读者们强烈推荐。”
+    </h3>
+    <p class="author">余凯</p>
+    <p class="title">地平线公司创始人 & CEO </p>
+  </div>
+
+  <div class="endorse">
+    <h3>“强烈推荐这本书！我特别赞赏这种手脑一体的学习方式。”</h3>
+    <p class="author">漆远</p>
+    <p class="title">蚂蚁金服副总裁、首席AI科学家</p>
+  </div>
+
+  <div class="endorse">
+    <h3>“这本书可以帮助深度学习实践者快速提升自己的能力。”</h3>
+    <p class="author">张潼</p>
+    <p class="title">腾讯 AI Lab 主任</p>
+  </div>
+
+  <div class="endorse">
+    <h3>“《动手学深度学习》是一本很容易让学习者上瘾的书。”</h3>
+    <p class="author">沈强</p>
+    <p class="title">将门创投创始合伙人</p>
+  </div>
+  </div>
+</div>
 <div class = "features mdl-grid">
  <div class = "mdl-cell mdl-cell--5-col mdl-cell--middle">
    <div class="content">

--- a/build/ipynb2mdd.sh
+++ b/build/ipynb2mdd.sh
-#!/bin/bash
-
-MD="mdd"
-CH="ch.md"
-
-[ -e $MD ] && rm -rf $MD
-mkdir $MD
-
-# Collect files.
-cp index.rst $MD/
-cp -R img $MD/
-for f in chapter*/*; do
-	dir=$(dirname "$f")
-	if [ "${f##*.}" = "md" ] || [ "${f##*.}" = "ipynb" ]; then
-		mkdir -p $MD/$dir
-		cp $f $MD/$f
-	fi
-done
-
-# ipynb to md.
-for f in $MD/chapter*/*ipynb; do
-    base=$(basename $f)
-    jupyter nbconvert --to markdown $f --output "${base%%.*}.md"
-	rm $f
-done
-
-for f in $MD/chapter*/*md; do
-	dir=$(dirname "$f")
-	# Remove inner link.
-	sed -i 's/\[\([^]]*\)\]([^\)]*.md)/\1/g' $f
-	# Refer pdf instead of svg.
-	sed -i s/\\.svg/.pdf/g $f
-	# Refer img in the same level.
-	sed -i 's/\](..\/img/\](img/g' $f
-	if [ "$f" != "$dir/index.md" ]; then
-		sed -i s/#\ /##\ /g $f
-	fi
-done
-
-# Convert svg to pdf.
-for f in $MD/img/*svg; do
-	rsvg-convert -f pdf -z 0.80 -o "${f%%.*}.pdf" $f
-	rm $f
-done
-
-# Concat sections in each chapter.
-for f in $MD/chapter*/index.md; do
-	sections=$(python -c 'import mdd_utils; print(mdd_utils.get_sections())' $f)
-	dir=$(dirname "$f")
-	chapter=$dir/$CH
-	cat $f $sections > $chapter
-	perl -i -0777 -pe 's/```eval_rst[^`]+```//ge' $chapter
-done
-
-chapters=$(python -c 'import mdd_utils; print(mdd_utils.get_chapters())' $MD/index.rst)
-i=1
-for chapter in $chapters; do
-	# Move matplotlib plots outside.
-	mv $MD/$chapter/*_files $MD/
-	# Move ch.md to ../ch0x.md
-	mv $MD/$chapter/$CH $MD/ch$(printf %02d $i).md
-	rm -rf $MD/$chapter
-	i=$((i + 1))
-done
-
-# Convert matplotlib-generated svg to pdf.
-for f in $MD/*_files/*svg; do
-	rsvg-convert -f pdf -z 0.80 -o "${f%%.*}.pdf" $f
-	rm $f
-done
-
-rm $MD/toc.rst
-
-# zip files.
-[ -e "$MD.zip" ] && rm "$MD.zip"
-zip -r "$MD.zip" $MD
-[ -e $MD ] && rm -rf $MD
--- a/build/lint.sh
+++ b/build/lint.sh
-#!/bin/bash                                                                                                   
-
-# Prerequisite: pip install nblint
-
-OUT=outlint
-
-[ -e $OUT ] && rm $OUT
-
-for f in build/chapter*/*.ipynb; do
-	echo '===' $f
-	echo '===' $f >> $OUT
-	nblint --linter pyflakes $f >> $OUT 
-	nblint $f >> $OUT
-done
-
-# E302 expected 2 blank lines, found 1
-# E305 expected 2 blank lines after class or function definition, found 1
-# E402 module level import not at top of file
-# E703 statement ends with a semicolon
-# E741 ambiguous variable name
-IGNORE=( 'E302' 
-		 'E305'
-		 'E402' 
-		 'E703'
-		 'E741' )
-
-for ign in "${IGNORE[@]}"; do
-	sed -i /$ign/d $OUT
-done
--- a/build/md2ipynb.py
+++ b/build/md2ipynb.py
-import sys
-import os
-import time
-import notedown
-import nbformat
-
-assert len(sys.argv) == 3, 'usage: input.md output.ipynb'
-
-def is_ascii(character):
-    return ord(character) <= 128
-
-def add_space_between_ascii_and_non_ascii(string):
-    punc = {' ', '\n', '\t', '\r', '，', '。', '？', '！', '、',
-            '；', '：', '“', '”', '（', '）', '【', '】', '—',
-            '…', '《', '》', '`', '(', ')', '[', ']', ',', '.',
-            '?', '!', ';', ':', '\'', '"'}
-    if len(string) == 0:
-        return ''
-
-    ret = []
-    # We don't allow space within figure cpations, such as ![](). 
-    is_fig_caption = False
-    num_left_brackets = 0
-    for i in range(len(string) - 1):
-        cur_char = string[i]
-        next_char = string[i + 1]
-        if cur_char == '[':
-            if i > 0 and string[i - 1] == '!':
-                is_fig_caption = True
-            else:
-                num_left_brackets += 1
-        elif cur_char == ']':
-            if num_left_brackets > 0:
-                num_left_brackets -= 1
-            else:
-                is_fig_caption = False
-
-        ret.append(cur_char)
-        if ((is_ascii(cur_char) != is_ascii(next_char))
-            and (cur_char not in punc)
-            and (next_char not in punc)
-            and not is_fig_caption):
-            ret.append(' ')
-
-    ret.append(string[-1])
-    return ''.join(ret)
-
-# timeout for each notebook, in sec
-timeout = 20 * 60
-
-# the files will be ingored for execution
-ignore_execution = []
-
-input_fn = sys.argv[1]
-output_fn = sys.argv[2]
-
-reader = notedown.MarkdownReader(match='strict')
-
-do_eval = int(os.environ.get('EVAL', True))
-
-# read
-with open(input_fn, 'r') as f:
-    notebook = reader.read(f)
-
-for c in notebook.cells:
-    c.source = add_space_between_ascii_and_non_ascii(c.source)
-
-if do_eval and not any([i in input_fn for i in ignore_execution]):
-    tic = time.time()
-    notedown.run(notebook, timeout)
-    print('=== Finished evaluation in %f sec'%(time.time()-tic))
-
-# write
-# need to add language info to for syntax highlight
-notebook['metadata'].update({'language_info':{'name':'python'}})
-
-with open(output_fn, 'w') as f:
-    f.write(nbformat.writes(notebook))
--- a/build/mdd_utils.py
+++ b/build/mdd_utils.py
-import os
-import sys
-
-def get_sections():
-    assert len(sys.argv) == 2
-    index_md = sys.argv[1]
-    dirname = os.path.dirname(index_md)
-
-    start = False
-    sections = []
-    with open(index_md) as f:
-        for line in f:
-            line = line.rstrip().lstrip()
-            if ':maxdepth:' in line:
-                start = True
-                continue
-            elif line == '```':
-                break
-            if start and len(line) > 1:
-                sections.append(os.path.join(dirname, line + '.md'))
-    return ' '.join(sections)
-
-
-def get_chapters():
-    assert len(sys.argv) == 2
-    index_md = sys.argv[1]
-
-    start = False
-    chapters = []
-    with open(index_md) as f:
-        for line in f:
-            line = line.rstrip().lstrip()
-            if ':maxdepth:' in line:
-                start = True
-                continue
-            elif line == '```':
-                break
-            if start and len(line) > 1:
-                chapters.append(line.split('/')[0])
-    return ' '.join(chapters)
-
--- a/build/translate.sh
+++ b/build/translate.sh
-#!/bin/bash
-
-pre() {
-	echo "Pre-processing markdown files in source lauguage.";
-	for f in chapter*/*.md; do
-		echo $f
-		sed -i s/\.python\ \.input/\.python-\.input/g $f
-		sed -i s/\.input\ \ n=/\.input-n=/g $f
-		sed -i s/\</%%%less-than%%%/g $f
-		sed -i s/\&/%%%ampersand%%%/g $f
-	done
-}
-
-extract() {
-	echo "Convert markdown files into xliff (in source language) and skeleton files.";
-	BSL="bookSrcLang"
-	[ -e $BSL ] && rm -rf $BSL
-	mkdir -p $BSL
-	for f in chapter*/*.md; do
-		echo $f
-		xlf="${f%%.*}.xlf"
-		sklmd="${f%%.*}.skl.md"
-		./md2xliff/bin/extract $f $xlf $sklmd 'zh-CN' 'en-US'
-		# Generate bookSrcLang that contains only xlf files.
-		dir=$(dirname "$f")
-		mkdir -p $BSL/$dir
-		base=$(basename $f)
-		xlf_base="${base%%.*}.xlf"
-		cp $xlf $BSL/$dir/$xlf_base
-	done
-}
-
-reconstruct() {
-	echo "Convert xliff (in target language) and skeleton files into markdown files.";
-	BTL="bookTgtLang"
-	for f in chapter*/*.xlf; do
-		echo $f
-		# Load xlf files from translated dir.
-		cp $BTL/$f $f
-		md="${f%%.*}.md"
-		sklmd="${f%%.*}.skl.md"
-		./md2xliff/bin/xliff-reconstruct $f $sklmd $md
-	#rm $f
-	#rm $sklmd
-	done
-}
-
-post() {
-	echo "Post-processing markdown files in target language.";
-	for f in chapter*/*.md; do
-		echo $f
-		sed -i s/\.python-\.input/\.python\ \.input/g $f
-		sed -i s/\.input-n=/\.input\ \ n=/g $f
-		sed -i s/%%%less-than%%%/\</g $f
-		sed -i s/%%%ampersand%%%/\\\&/g $f
-	done
-}
-
-"$@"
--- a/build/upload.sh
+++ b/build/upload.sh
-#!/bin/bash
-set -e
-
-conda activate d2l-zh-build
-
-# BUCKET=s3://zh.diveintodeeplearning.org
-# BUCKET=s3://diveintodeeplearning-staging
-
-DIR=build/_build/html/
-
-aws s3 sync --delete $DIR s3://zh.d2l.ai --acl 'public-read' --quiet
-aws s3 sync --delete $DIR s3://zh.diveintodeeplearning.org --acl 'public-read' --quiet
-
-#find $DIR \( -iname '*.css' -o -iname '*.js' \) -exec gzip -9 -n {} \; -exec mv {}.gz {} \;
-
-#aws s3 sync --exclude '*.*' --include '*.css' \
-#      --content-type 'text/css' \
-#      --content-encoding 'gzip' \
-#      --acl 'public-read' \
-#      $DIR $BUCKET
-
-#aws s3 sync --exclude '*.*' --include '*.woff' --include '*.woff2' \
-#      --expires "$(date -d '+24 months' --utc +'%Y-%m-%dT%H:%M:%SZ')" \
-#      --acl 'public-read' --quiet \
-#      $DIR $BUCKET
-
-#aws s3 sync --exclude '*.*' --include '*.js' \
-#      --content-type 'application/javascript' \
-#      --content-encoding 'gzip' \
-#      --acl 'public-read' \
-#      $DIR $BUCKET
-
-#aws s3 sync --delete $DIR $BUCKET --acl 'public-read' --quiet
--- a/utils @ 9f282ab1
+++ b/utils @ 9f282ab1
+Subproject commit 9f282ab1249dee696d17434f2f7969295bf2a13c
--- a/build/win_batch_md2ipynb.py
+++ b/build/win_batch_md2ipynb.py
-from distutils.dir_util import copy_tree
-import glob
-import nbformat
-import notedown
-import os
-from subprocess import check_output
-import sys
-import time
-
-# To access data/imgs/gluonbook in upper level.
-os.chdir('build')
-
-def mkdir_if_not_exist(path):
-    if not os.path.exists(os.path.join(*path)):
-        os.makedirs(os.path.join(*path))
-
-# Timeout for each notebook, in sec
-timeout = 60 * 60
-
-# The files will be ingored for execution
-ignore_execution = ['chapter_computational-performance/async-computation.md']
-
-reader = notedown.MarkdownReader(match='strict')
-
-do_eval = int(os.environ.get('EVAL', True))
-
-
-for chap in glob.glob(os.path.join('..', 'chapter_*')):
-    mkdir_if_not_exist(['win_ipynb', chap[3:]])
-    mds = filter(lambda x: x.endswith('md'), os.listdir(chap))
-    for md in mds:
-        if md != 'index.md':
-            in_md = os.path.join(chap, md)
-            out_nb = os.path.join('win_ipynb', in_md[3:-2] + 'ipynb')
-
-            if not os.path.exists(out_nb):
-
-                print('---', in_md[3:])
-                # read
-                with open(in_md, 'r', encoding="utf8") as f:
-                    notebook = reader.read(f)
-
-                if do_eval and chap[3:] + '/' + md not in ignore_execution:
-                    tic = time.time()
-                    notedown.run(notebook, timeout)
-                    print('=== Finished evaluation in %f sec'%(time.time()-tic))
-
-                # write
-                # need to add language info to for syntax highlight
-                notebook['metadata'].update({'language_info':{'name':'python'}})
-
-                with open(out_nb, 'w', encoding="utf8") as f:
-                    f.write(nbformat.writes(notebook))
-
-
--- a/chapter_appendix/aws.md
+++ b/chapter_appendix/aws.md
@@ -24,11 +24,11 @@
 ![EC2面板。](../img/ec2.png)


-图11.10的最上面一行显示了配置实例所需的7个步骤。在第一步“1. Chosse AMI”中，选择Ubuntu 16.04作为操作系统。
+图11.10的最上面一行显示了配置实例所需的7个步骤。在第一步“1. Choose AMI”中，选择Ubuntu 16.04作为操作系统。

 ![选择操作系统。](../img/os.png)

-EC2提供了大量不同配置的实例。如图11.11所示，在第二步“2. Chosse Instance Type”中，选择有一个K80 GPU的“p2.xlarge”实例。我们也可以选择像“p2.16xlarge”这样有多个GPU的实例。如果你想比较不同实例的机器配置和收费，可参考 https://www.ec2instances.info/ 。
+EC2提供了大量不同配置的实例。如图11.11所示，在第二步“2. Choose Instance Type”中，选择有一个K80 GPU的“p2.xlarge”实例。我们也可以选择像“p2.16xlarge”这样有多个GPU的实例。如果你想比较不同实例的机器配置和收费，可参考 https://www.ec2instances.info/ 。

 ![选择实例。](../img/p2x.png)

@@ -177,7 +177,7 @@ ssh -i "/path/to/key.pem" ubuntu@ec2-xx-xxx-xxx-xxx.y.compute.amazonaws.com -L 8

 如果较短时间内还将重新开启实例，右击图11.16中的示例，选择“Instance State” $\rightarrow$ “Stop”将实例停止，等下次使用时选择“Instance State” $\rightarrow$ “Start”重新开启实例。这种情况下，开启的实例将保留其停止前硬盘上的存储（例如无需再安装CUDA和其他运行环境）。然而，停止状态的实例也会因其所保留的硬盘空间而产生少量计费。

-如果较长时间内不会重新开启实例，右击图11.16中的示例，选择“Image” $\rightarrow$ “Create”创建镜像。然后，选择“Instance State” $\rightarrow$ “Terminate”将实例终结（硬盘不再产生计费）。当下次使用时，我们可按本节中创建并运行EC2实例的步骤重新创建一个基于保存镜像的实例。唯一的区别在于，在图11.10的第一步“1. Chosse AMI”中，我们需要通过左栏“My AMIs”选择之前保存的镜像。这样创建的实例将保留镜像上硬盘的存储，例如无需再安装CUDA和其他运行环境。
+如果较长时间内不会重新开启实例，右击图11.16中的示例，选择“Image” $\rightarrow$ “Create”创建镜像。然后，选择“Instance State” $\rightarrow$ “Terminate”将实例终结（硬盘不再产生计费）。当下次使用时，我们可按本节中创建并运行EC2实例的步骤重新创建一个基于保存镜像的实例。唯一的区别在于，在图11.10的第一步“1. Choose AMI”中，我们需要通过左栏“My AMIs”选择之前保存的镜像。这样创建的实例将保留镜像上硬盘的存储，例如无需再安装CUDA和其他运行环境。

 ## 小结


--- a/chapter_appendix/buy-gpu.md
+++ b/chapter_appendix/buy-gpu.md
@@ -20,10 +20,6 @@ GPU的性能主要由以下三个参数构成：

 对于大部分用户来说，只要考虑计算能力就可以了。显存尽量不小于4GB。但如果GPU要同时显示图形界面，那么推荐的显存大小至少为6GB。显存带宽通常相对固定，选择空间较小。

-一般来说，若想利用到GPU强大的计算能力，该显卡的CUDA Compute Capability 需不低于3.0，可以到NVIDIA官方网站 (https://developer.nvidia.com/cuda-gpus) 查询自己所用显卡的CUDA Compute Capability。下图是NVIDIA GeForce系列部分显卡的Compute Capability。  
-  
-![](https://i.imgur.com/DwaDMaB.png)  
-
 图11.19描绘了GTX 900和1000系列里各个型号的32位浮点计算能力和价格的对比。其中价格为Wikipedia的建议价格。

 ![浮点计算能力和价格的对比。](../img/gtx.png)

--- a/chapter_appendix/gluonbook.md
+++ b/chapter_appendix/gluonbook.md
@@ -3,7 +3,6 @@

 函数、类等名称：定义所在章节

-* `accuracy`：[Softmax回归的从零开始实现](../chapter_deep-learning-basics/softmax-regression-scratch.md)
 * `bbox_to_rect`：[物体检测和边界框](../chapter_computer-vision/bounding-box.md)
 * `Benchmark`：[异步计算](../chapter_computational-performance/async-computation.md)
 * `corr2d`：[二维卷积层](../chapter_convolutional-neural-networks/conv-layer.md)

--- a/chapter_computational-performance/multiple-gpus-gluon.md
+++ b/chapter_computational-performance/multiple-gpus-gluon.md
@@ -96,7 +96,7 @@ def train(num_gpus, batch_size, lr):
        nd.waitall()
        train_time = time.time() - start
        test_acc = gb.evaluate_accuracy(test_iter, net, ctx[0])
-        print('epoch %d, training time: %.1f sec, test_acc %.2f' % (
+        print('epoch %d, time: %.1f sec, test acc %.2f' % (
            epoch + 1, train_time, test_acc))
 ```


--- a/chapter_computer-vision/image-augmentation.md
+++ b/chapter_computer-vision/image-augmentation.md
@@ -11,7 +11,7 @@ import mxnet as mx
 from mxnet import autograd, gluon, image, init, nd
 from mxnet.gluon import data as gdata, loss as gloss, utils as gutils
 import sys
-from time import time
+import time
 ```

 ## 常用的图像增广方法
@@ -162,8 +162,7 @@ def _get_batch(batch, ctx):
        labels = labels.astype(features.dtype)
    # 当 ctx 包含多个 GPU 时，划分小批量数据样本并复制到各个 GPU 上。
    return (gutils.split_and_load(features, ctx),
-            gutils.split_and_load(labels, ctx),
-            features.shape[0])
+            gutils.split_and_load(labels, ctx), features.shape[0])
 ```

 然后，我们定义`evaluate_accuracy`函数评价模型的分类准确率。与[“Softmax回归的从零开始实现”](../chapter_deep-learning-basics/softmax-regression-scratch.md)和[“卷积神经网络（LeNet）”](../chapter_convolutional-neural-networks/lenet.md)两节中描述的`evaluate_accuracy`函数不同，这里定义的函数更加通用：它通过辅助函数`_get_batch`使用`ctx`变量所包含的所有GPU来评价模型。
@@ -173,16 +172,15 @@ def _get_batch(batch, ctx):
 def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
-    acc = nd.array([0])
-    n = 0
+    acc_sum, n = nd.array([0]), 0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, ctx)
        for X, y in zip(features, labels):
            y = y.astype('float32')
-            acc += (net(X).argmax(axis=1) == y).sum().copyto(mx.cpu())
+            acc_sum += (net(X).argmax(axis=1) == y).sum().copyto(mx.cpu())
            n += y.size
-        acc.wait_to_read()
-    return acc.asscalar() / n
+        acc_sum.wait_to_read()
+    return acc_sum.asscalar() / n
 ```

 接下来，我们定义`train`函数使用多GPU训练并评价模型。
@@ -194,8 +192,7 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
-        train_l_sum, train_acc_sum, n, m = 0.0, 0.0, 0.0, 0.0
-        start = time()
+        train_l_sum, train_acc_sum, n, m, start = 0.0, 0.0, 0, 0, time.time()
        for i, batch in enumerate(train_iter):
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
@@ -204,17 +201,17 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
+            trainer.step(batch_size)
+            train_l_sum += sum([l.sum().asscalar() for l in ls])
+            n += sum([l.size for l in ls])
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                 for y_hat, y in zip(y_hats, ys)])
-            train_l_sum += sum([l.sum().asscalar() for l in ls])
-            trainer.step(batch_size)
-            n += batch_size
            m += sum([y.size for y in ys])
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / m, test_acc,
-                 time() - start))
+                 time.time() - start))
 ```

 现在，我们可以定义`train_with_data_aug`函数使用图像增广来训练模型了。该函数获取了所有可用的GPU，并将Adam作为训练使用的优化算法，然后将图像增广应用于训练数据集之上，最后调用刚才定义的`train`函数训练并评价模型。

--- a/chapter_computer-vision/index.md
+++ b/chapter_computer-vision/index.md
 # 计算机视觉

-无论是医疗诊断、无人车、摄像监控还是智能滤镜，计算机视觉领域的诸多应用都与我们当下和未来的生活息息相关。近年来，深度学习技术深刻推动了计算机视觉系统性能的提升。可以说，当下最先进的计算机视觉应用几乎离不开深度学习。
+无论是医疗诊断、无人车、摄像监控，还是智能滤镜，计算机视觉领域的诸多应用都与我们当下和未来的生活息息相关。近年来，深度学习技术深刻推动了计算机视觉系统性能的提升。可以说，当下最先进的计算机视觉应用几乎离不开深度学习。有鉴于此，本章将关注计算机视觉领域，并从中挑选时下在学术界和工业界具有影响力的方法与应用来展示深度学习的魅力。

-我们在“卷积神经网络”一章中已经介绍了计算机视觉领域常使用的深度学习模型，并实践了简单的图像分类任务。本章中，我们先进一步介绍图像增广（image augmentation）和微调（fine tuning）的方法，并将它们应用于图像分类。然后，我们会探究目标检测（object detection）的各类方法。之后，我们将了解如何使用全卷积网络对图像做语义分割（semantic segmentation）。接下来，我们再解释如何使用样式迁移技术生成像本书封面一样的图像。最后，我们在两个计算机视觉的重要数据集上实践本章和前几章的内容。
+我们在“卷积神经网络”一章中已经介绍了计算机视觉领域常使用的深度学习模型，并实践了简单的图像分类任务。在本章的开头，我们介绍图像增广和微调的方法，并把它们应用于图像分类。由于深度神经网络能够对图像逐级有效地进行表征，这一特性被广泛应用在目标检测、语义分割和样式迁移这些主流计算机视觉任务中，并取得了成功。围绕这一核心思想，首先，我们将描述目标检测的工作流程与各类方法。之后，我们将探究如何使用全卷积网络对图像做语义分割。接下来，我们再解释如何使用样式迁移技术生成像本书封面一样的图像。最后，我们在两个计算机视觉的重要数据集上实践本章和前几章的知识。

 ```eval_rst


--- a/chapter_computer-vision/kaggle-gluon-cifar10.md
+++ b/chapter_computer-vision/kaggle-gluon-cifar10.md
@@ -184,13 +184,13 @@ test_ds = gdata.vision.ImageFolderDataset(
 我们在`DataLoader`中指明定义好的图像增广操作。在训练时，我们仅用验证集评价模型，因此需要保证输出的确定性。在预测时，我们将在训练集和验证集的并集上训练模型，以充分利用所有标注的数据。

 ```{.python .input}
-train_data = gdata.DataLoader(train_ds.transform_first(transform_train),
+train_iter = gdata.DataLoader(train_ds.transform_first(transform_train),
                              batch_size, shuffle=True, last_batch='keep')
-valid_data = gdata.DataLoader(valid_ds.transform_first(transform_test),
+valid_iter = gdata.DataLoader(valid_ds.transform_first(transform_test),
                              batch_size, shuffle=True, last_batch='keep')
-train_valid_data = gdata.DataLoader(train_valid_ds.transform_first(
+train_valid_iter = gdata.DataLoader(train_valid_ds.transform_first(
    transform_train), batch_size, shuffle=True, last_batch='keep')
-test_data = gdata.DataLoader(test_ds.transform_first(transform_test),
+test_iter = gdata.DataLoader(test_ds.transform_first(transform_test),
                             batch_size, shuffle=False, last_batch='keep')
 ```

@@ -263,33 +263,33 @@ loss = gloss.SoftmaxCrossEntropyLoss()
 我们将根据模型在验证集上的表现来选择模型并调节超参数。下面定义了模型的训练函数`train`。我们记录了每个迭代周期的训练时间，这有助于比较不同模型的时间开销。

 ```{.python .input  n=12}
-def train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
+def train(net, train_iter, valid_iter, num_epochs, lr, wd, ctx, lr_period,
          lr_decay):
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0.9, 'wd': wd})
    for epoch in range(num_epochs):
-        train_l, train_acc, start = 0.0, 0.0, time.time()
+        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        if epoch > 0 and epoch % lr_period == 0:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
-        for X, y in train_data:
+        for X, y in train_iter:
            y = y.astype('float32').as_in_context(ctx)
            with autograd.record():
                y_hat = net(X.as_in_context(ctx))
-                l = loss(y_hat, y)
+                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
-            train_l += l.mean().asscalar()
-            train_acc += gb.accuracy(y_hat, y)
+            train_l_sum += l.asscalar()
+            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
+            n += y.size
        time_s = "time %.2f sec" % (time.time() - start)
-        if valid_data is not None:
-            valid_acc = gb.evaluate_accuracy(valid_data, net, ctx)
+        if valid_iter is not None:
+            valid_acc = gb.evaluate_accuracy(valid_iter, net, ctx)
            epoch_s = ("epoch %d, loss %f, train acc %f, valid acc %f, "
-                       % (epoch + 1, train_l / len(train_data),
-                          train_acc / len(train_data), valid_acc))
+                       % (epoch + 1, train_l_sum / n, train_acc_sum / n,
+                       valid_acc))
        else:
            epoch_s = ("epoch %d, loss %f, train acc %f, " %
-                       (epoch + 1, train_l / len(train_data),
-                        train_acc / len(train_data)))
+                       (epoch + 1, train_l_sum / n, train_acc_sum / n))
        print(epoch_s + time_s + ', lr ' + str(trainer.learning_rate))
 ```

@@ -301,7 +301,7 @@ def train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
 ctx, num_epochs, lr, wd = gb.try_gpu(), 1, 0.1, 5e-4
 lr_period, lr_decay, net = 80, 0.1, get_net(ctx)
 net.hybridize()
-train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
+train(net, train_iter, valid_iter, num_epochs, lr, wd, ctx, lr_period,
      lr_decay)
 ```

@@ -312,10 +312,10 @@ train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
 ```{.python .input  n=14}
 net, preds = get_net(ctx), []
 net.hybridize()
-train(net, train_valid_data, None, num_epochs, lr, wd, ctx, lr_period,
+train(net, train_valid_iter, None, num_epochs, lr, wd, ctx, lr_period,
      lr_decay)

-for X, _ in test_data:
+for X, _ in test_iter:
    y_hat = net(X.as_in_context(ctx))
    preds.extend(y_hat.argmax(axis=1).astype(int).asnumpy())
 sorted_ids = list(range(1, len(test_ds) + 1))

--- a/chapter_computer-vision/kaggle-gluon-dog.md
+++ b/chapter_computer-vision/kaggle-gluon-dog.md
@@ -167,13 +167,13 @@ test_ds = gdata.vision.ImageFolderDataset(
 这里创建`DataLoader`实例的方法也与上一节中的相同。

 ```{.python .input}
-train_data = gdata.DataLoader(train_ds.transform_first(transform_train),
+train_iter = gdata.DataLoader(train_ds.transform_first(transform_train),
                              batch_size, shuffle=True, last_batch='keep')
-valid_data = gdata.DataLoader(valid_ds.transform_first(transform_test),
+valid_iter = gdata.DataLoader(valid_ds.transform_first(transform_test),
                              batch_size, shuffle=True, last_batch='keep')
-train_valid_data = gdata.DataLoader(train_valid_ds.transform_first(
+train_valid_iter = gdata.DataLoader(train_valid_ds.transform_first(
    transform_train), batch_size, shuffle=True, last_batch='keep')
-test_data = gdata.DataLoader(test_ds.transform_first(transform_test),
+test_iter = gdata.DataLoader(test_ds.transform_first(transform_test),
                             batch_size, shuffle=False, last_batch='keep')
 ```

@@ -203,14 +203,15 @@ def get_net(ctx):
 ```{.python .input}
 loss = gloss.SoftmaxCrossEntropyLoss()

-def get_loss(data, net, ctx):
-    l = 0.0
-    for X, y in data:
+def evaluate_loss(data_iter, net, ctx):
+    l_sum, n = 0.0, 0
+    for X, y in data_iter:
        y = y.as_in_context(ctx)
        output_features = net.features(X.as_in_context(ctx))
        outputs = net.output_new(output_features)
-        l += loss(outputs, y).mean().asscalar()
-    return l / len(data)
+        l_sum += loss(outputs, y).sum().asscalar()
+        n += y.size
+    return l_sum / n
 ```

 ## 定义训练函数
@@ -218,32 +219,33 @@ def get_loss(data, net, ctx):
 我们将依赖模型在验证集上的表现来选择模型并调节超参数。模型的训练函数`train`只训练自定义的小规模输出网络。

 ```{.python .input  n=7}
-def train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
+def train(net, train_iter, valid_iter, num_epochs, lr, wd, ctx, lr_period,
          lr_decay):
    # 只训练我们定义的小规模输出网络。
    trainer = gluon.Trainer(net.output_new.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0.9, 'wd': wd})
    for epoch in range(num_epochs):
-        train_l, start = 0.0, time.time()
+        train_l_sum, n, start = 0.0, 0, time.time()
        if epoch > 0 and epoch % lr_period == 0:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
-        for X, y in train_data:
-            y = y.astype('float32').as_in_context(ctx)
+        for X, y in train_iter:
+            y = y.as_in_context(ctx)
            output_features = net.features(X.as_in_context(ctx))
            with autograd.record():
                outputs = net.output_new(output_features)
-                l = loss(outputs, y)
+                l = loss(outputs, y).sum()
            l.backward()
            trainer.step(batch_size)
-            train_l += l.mean().asscalar()
+            train_l_sum += l.asscalar()
+            n += y.size
        time_s = "time %.2f sec" % (time.time() - start)
-        if valid_data is not None:
-            valid_loss = get_loss(valid_data, net, ctx)
+        if valid_iter is not None:
+            valid_loss = evaluate_loss(valid_iter, net, ctx)
            epoch_s = ("epoch %d, train loss %f, valid loss %f, "
-                       % (epoch + 1, train_l / len(train_data), valid_loss))
+                       % (epoch + 1, train_l_sum / n, valid_loss))
        else:
            epoch_s = ("epoch %d, train loss %f, "
-                       % (epoch + 1, train_l / len(train_data)))
+                       % (epoch + 1, train_l_sum / n))
        print(epoch_s + time_s + ', lr ' + str(trainer.learning_rate))
 ```

@@ -255,7 +257,7 @@ def train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
 ctx, num_epochs, lr, wd = gb.try_gpu(), 1, 0.01, 1e-4
 lr_period, lr_decay, net = 10, 0.1, get_net(ctx)
 net.hybridize()
-train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
+train(net, train_iter, valid_iter, num_epochs, lr, wd, ctx, lr_period,
      lr_decay)
 ```

@@ -266,11 +268,11 @@ train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
 ```{.python .input  n=8}
 net = get_net(ctx)
 net.hybridize()
-train(net, train_valid_data, None, num_epochs, lr, wd, ctx, lr_period,
+train(net, train_valid_iter, None, num_epochs, lr, wd, ctx, lr_period,
      lr_decay)

 preds = []
-for data, label in test_data:
+for data, label in test_iter:
    output_features = net.features(data.as_in_context(ctx))
    output = nd.softmax(net.output_new(output_features))
    preds.extend(output.asnumpy())

--- a/chapter_computer-vision/ssd.md
+++ b/chapter_computer-vision/ssd.md
@@ -193,7 +193,7 @@ print('output bbox preds:', bbox_preds.shape)

 ```{.python .input  n=14}
 batch_size = 32
-train_data, test_data = gb.load_data_pikachu(batch_size)
+train_iter, _ = gb.load_data_pikachu(batch_size)
 ```

 在皮卡丘数据集中，目标的类别数为1。定义好模型以后，我们需要初始化模型参数并定义优化算法。
@@ -224,10 +224,10 @@ def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
 ```{.python .input  n=18}
 def cls_eval(cls_preds, cls_labels):
    # 由于类别预测结果放在最后一维，argmax 需要指定最后一维。
-    return (cls_preds.argmax(axis=-1) == cls_labels).mean().asscalar()
+    return (cls_preds.argmax(axis=-1) == cls_labels).sum().asscalar()

 def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
-    return ((bbox_labels - bbox_preds) * bbox_masks).abs().mean().asscalar()
+    return ((bbox_labels - bbox_preds) * bbox_masks).abs().sum().asscalar()
 ```

 ### 训练模型
@@ -236,10 +236,10 @@ def bbox_eval(bbox_preds, bbox_labels, bbox_masks):

 ```{.python .input  n=19}
 for epoch in range(20):
-    acc, mae = 0, 0
-    train_data.reset()  # 从头读取数据。
+    acc_sum, mae_sum, n, m = 0.0, 0.0, 0, 0
+    train_iter.reset()  # 从头读取数据。
    start = time.time()
-    for i, batch in enumerate(train_data):
+    for batch in train_iter:
        X = batch.data[0].as_in_context(ctx)
        Y = batch.label[0].as_in_context(ctx)
        with autograd.record():
@@ -253,11 +253,14 @@ for epoch in range(20):
                          bbox_masks)
        l.backward()
        trainer.step(batch_size)
-        acc += cls_eval(cls_preds, cls_labels)
-        mae += bbox_eval(bbox_preds, bbox_labels, bbox_masks)
+        acc_sum += cls_eval(cls_preds, cls_labels)
+        n += cls_labels.size
+        mae_sum += bbox_eval(bbox_preds, bbox_labels, bbox_masks)
+        m += bbox_labels.size
+
    if (epoch + 1) % 5 == 0:
        print('epoch %2d, class err %.2e, bbox mae %.2e, time %.1f sec' % (
-            epoch + 1, 1 - acc / (i + 1), mae / (i + 1), time.time() - start))
+            epoch + 1, 1 - acc_sum / n, mae_sum / m, time.time() - start))
 ```

 ## 预测

--- a/chapter_convolutional-neural-networks/batch-norm.md
+++ b/chapter_convolutional-neural-networks/batch-norm.md
@@ -34,17 +34,17 @@ $$\boldsymbol{\sigma}_\mathcal{B}^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m}(\bolds

 $$\hat{\boldsymbol{x}}^{(i)} \leftarrow \frac{\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B}}{\sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}},$$

-这里$\epsilon > 0$是一个很小的常数，保证分母大于0。在上面标准化的基础上，批量归一化层引入了两个可以学习的模型参数，拉升（scale）参数 $\boldsymbol{\gamma}$ 和偏移（shift）参数 $\boldsymbol{\beta}$。这两个参数和$\boldsymbol{x}^{(i)}$形状相同，皆为$d$维向量。它们与$\boldsymbol{x}^{(i)}$分别做按元素乘法（符号$\odot$）和加法计算：
+这里$\epsilon > 0$是一个很小的常数，保证分母大于0。在上面标准化的基础上，批量归一化层引入了两个可以学习的模型参数，拉伸（scale）参数 $\boldsymbol{\gamma}$ 和偏移（shift）参数 $\boldsymbol{\beta}$。这两个参数和$\boldsymbol{x}^{(i)}$形状相同，皆为$d$维向量。它们与$\boldsymbol{x}^{(i)}$分别做按元素乘法（符号$\odot$）和加法计算：

 $${\boldsymbol{y}}^{(i)} \leftarrow \boldsymbol{\gamma} \odot \hat{\boldsymbol{x}}^{(i)} + \boldsymbol{\beta}.$$

 至此，我们得到了$\boldsymbol{x}^{(i)}$的批量归一化的输出$\boldsymbol{y}^{(i)}$。
-值得注意的是，可学习的拉升和偏移参数保留了不对$\hat{\boldsymbol{x}}^{(i)}$做批量归一化的可能：此时只需学出$\boldsymbol{\gamma} = \sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}$和$\boldsymbol{\beta} = \boldsymbol{\mu}_\mathcal{B}$。我们可以对此这样理解：如果批量归一化无益，理论上学出的模型可以不使用批量归一化。
+值得注意的是，可学习的拉伸和偏移参数保留了不对$\hat{\boldsymbol{x}}^{(i)}$做批量归一化的可能：此时只需学出$\boldsymbol{\gamma} = \sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}$和$\boldsymbol{\beta} = \boldsymbol{\mu}_\mathcal{B}$。我们可以对此这样理解：如果批量归一化无益，理论上学出的模型可以不使用批量归一化。


 ### 对卷积层做批量归一化

-对卷积层来说，批量归一化发生在卷积计算之后、应用激活函数之前。如果卷积计算输出多个通道，我们需要对这些通道的输出分别做批量归一化，且每个通道都拥有独立的拉升和偏移参数，且均为标量。设小批量中有$m$个样本。在单个通道上，假设卷积计算输出的高和宽分别为$p$和$q$。我们需要对该通道中$m \times p \times q$个元素同时做批量归一化。对这些元素做标准化计算时，我们使用相同的均值和方差，即该通道中$m \times p \times q$个元素的均值和方差。
+对卷积层来说，批量归一化发生在卷积计算之后、应用激活函数之前。如果卷积计算输出多个通道，我们需要对这些通道的输出分别做批量归一化，且每个通道都拥有独立的拉伸和偏移参数，且均为标量。设小批量中有$m$个样本。在单个通道上，假设卷积计算输出的高和宽分别为$p$和$q$。我们需要对该通道中$m \times p \times q$个元素同时做批量归一化。对这些元素做标准化计算时，我们使用相同的均值和方差，即该通道中$m \times p \times q$个元素的均值和方差。


 ### 预测时的批量归一化
@@ -82,11 +82,11 @@ def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
        # 更新移动平均的均值和方差。
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
-    Y = gamma * X_hat + beta  # 拉升和偏移。
+    Y = gamma * X_hat + beta  # 拉伸和偏移。
    return Y, moving_mean, moving_var
 ```

-接下来我们自定义一个`BatchNorm`层。它保存参与求梯度和迭代的拉升参数`gamma`和偏移参数`beta`，同时也维护移动平均得到的均值和方差，以能够在模型预测时使用。`BatchNorm`实例所需指定的`num_features`参数对于全连接层为输出个数，对于卷积层则为输出通道数。该实例所需指定的`num_dims`参数对于全连接层和卷积层分别为2和4。
+接下来我们自定义一个`BatchNorm`层。它保存参与求梯度和迭代的拉伸参数`gamma`和偏移参数`beta`，同时也维护移动平均得到的均值和方差，以能够在模型预测时使用。`BatchNorm`实例所需指定的`num_features`参数对于全连接层为输出个数，对于卷积层则为输出通道数。该实例所需指定的`num_dims`参数对于全连接层和卷积层分别为2和4。

 ```{.python .input  n=73}
 class BatchNorm(nn.Block):
@@ -96,7 +96,7 @@ class BatchNorm(nn.Block):
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
-        # 参与求梯度和迭代的拉升和偏移参数，分别初始化成 0 和 1。
+        # 参与求梯度和迭代的拉伸和偏移参数，分别初始化成 1 和 0。
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        # 不参与求梯度和迭代的变量，全在 CPU 上初始化成 0。
@@ -148,7 +148,7 @@ train_iter, test_iter = gb.load_data_fashion_mnist(batch_size)
 gb.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
 ```

-最后我们查看下第一个批量归一化层学习到的拉升参数`gamma`和偏移参数`beta`。
+最后我们查看下第一个批量归一化层学习到的拉伸参数`gamma`和偏移参数`beta`。

 ```{.python .input  n=60}
 net[1].gamma.data().reshape((-1,)), net[1].beta.data().reshape((-1,))

--- a/chapter_convolutional-neural-networks/lenet.md
+++ b/chapter_convolutional-neural-networks/lenet.md
@@ -81,12 +81,13 @@ ctx
 # 本函数已保存在 gluonbook 包中方便以后使用。该函数将被逐步改进：它的完整实现将在“图像增
 # 广”一节中描述。
 def evaluate_accuracy(data_iter, net, ctx):
-    acc = nd.array([0], ctx=ctx)
+    acc_sum, n = nd.array([0], ctx=ctx), 0
    for X, y in data_iter:
        # 如果 ctx 代表 GPU 及相应的显存，将数据复制到显存上。
-        X, y = X.as_in_context(ctx), y.as_in_context(ctx)
-        acc += gb.accuracy(net(X), y)
-    return acc.asscalar() / len(data_iter)
+        X, y = X.as_in_context(ctx), y.as_in_context(ctx).astype('float32')
+        acc_sum += (net(X).argmax(axis=1) == y).sum()
+        n += y.size
+    return acc_sum.asscalar() / n
 ```

 我们同样对[“Softmax回归的从零开始实现”](../chapter_deep-learning-basics/softmax-regression-scratch.md)一节中定义的`train_ch3`函数略作修改，确保计算使用的数据和模型同在内存或显存上。
@@ -98,21 +99,23 @@ def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
    print('training on', ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
-        train_l_sum, train_acc_sum, start = 0, 0, time.time()
+        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            with autograd.record():
                y_hat = net(X)
-                l = loss(y_hat, y)
+                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
-            train_l_sum += l.mean().asscalar()
-            train_acc_sum += gb.accuracy(y_hat, y)
+            y = y.astype('float32')
+            train_l_sum += l.asscalar()
+            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
+            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
-              'time %.1f sec' % (epoch + 1, train_l_sum / len(train_iter),
-                                 train_acc_sum / len(train_iter),
-                                 test_acc, time.time() - start))
+              'time %.1f sec'
+              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
+                 time.time() - start))
 ```

 我们重新将模型参数初始化到设备变量`ctx`之上，并使用Xavier随机初始化。损失函数和训练算法则依然使用交叉熵损失函数和小批量随机梯度下降。

--- a/chapter_deep-learning-basics/kaggle-house-price.md
+++ b/chapter_deep-learning-basics/kaggle-house-price.md
@@ -195,7 +195,6 @@ def k_fold(k, X_train, y_train, num_epochs,

 ```{.python .input  n=16}
 k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
-verbose_epoch = num_epochs - 2
 train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
 print('%d-fold validation: avg train rmse: %f, avg valid rmse: %f'

--- a/chapter_deep-learning-basics/softmax-regression-scratch.md
+++ b/chapter_deep-learning-basics/softmax-regression-scratch.md
@@ -77,7 +77,7 @@ def net(X):

 ```{.python .input  n=9}
 y_hat = nd.array([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
-y = nd.array([0, 2])
+y = nd.array([0, 2], dtype='int32')
 nd.pick(y_hat, y)
 ```

@@ -92,10 +92,9 @@ def cross_entropy(y_hat, y):

 给定一个类别的预测概率分布`y_hat`，我们把预测概率最大的类别作为输出类别。如果它与真实类别`y`一致，说明这次预测是正确的。分类准确率即正确预测数量与总预测数量之比。

-下面定义准确率`accuracy`函数。其中`y_hat.argmax(axis=1)`返回矩阵`y_hat`每行中最大元素的索引，且返回结果与变量`y`形状相同。我们在[“数据操作”](../chapter_prerequisite/ndarray.md)一节介绍过，相等条件判断式`(y_hat.argmax(axis=1) == y)`是一个值为0（相等为假）或1（相等为真）的NDArray。由于标签类型为整数，我们先将变量`y`变换为浮点数再进行相等条件判断。
+为了演示准确率的计算，下面定义准确率`accuracy`函数。其中`y_hat.argmax(axis=1)`返回矩阵`y_hat`每行中最大元素的索引，且返回结果与变量`y`形状相同。我们在[“数据操作”](../chapter_prerequisite/ndarray.md)一节介绍过，相等条件判断式`(y_hat.argmax(axis=1) == y)`是一个值为0（相等为假）或1（相等为真）的NDArray。由于标签类型为整数，我们先将变量`y`变换为浮点数再进行相等条件判断。

 ```{.python .input  n=11}
-# 本函数已保存在 gluonbook 包中方便以后使用。
 def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1) == y.astype('float32')).mean().asscalar()
 ```
@@ -112,10 +111,12 @@ accuracy(y_hat, y)
 # 本函数已保存在 gluonbook 包中方便以后使用。该函数将被逐步改进：它的完整实现将在“图像增
 # 广”一节中描述。
 def evaluate_accuracy(data_iter, net):
-    acc = 0
+    acc_sum, n = 0.0, 0
    for X, y in data_iter:
-        acc += accuracy(net(X), y)
-    return acc / len(data_iter)
+        y = y.astype('float32')
+        acc_sum += (net(X).argmax(axis=1) == y).sum().asscalar()
+        n += y.size
+    return acc_sum / n
 ```

 因为我们随机初始化了模型`net`，所以这个随机模型的准确率应该接近于类别个数10的倒数0.1。
@@ -135,23 +136,23 @@ num_epochs, lr = 5, 0.1
 def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
-        train_l_sum = 0
-        train_acc_sum = 0
+        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
-                l = loss(y_hat, y)
+                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                gb.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # 下一节将用到。
-            train_l_sum += l.mean().asscalar()
-            train_acc_sum += accuracy(y_hat, y)
+            y = y.astype('float32')
+            train_l_sum += l.asscalar()
+            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
+            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
-              % (epoch + 1, train_l_sum / len(train_iter),
-                 train_acc_sum / len(train_iter), test_acc))
+              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

 train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs,
          batch_size, [W, b], lr)

--- a/chapter_introduction/deep-learning-intro.md
+++ b/chapter_introduction/deep-learning-intro.md
@@ -87,7 +87,7 @@

 * 游戏曾被认为是人类智能最后的堡垒。自使用时间差分强化学习玩双陆棋的TD-Gammon之始，算法和算力的发展催生了一系列在游戏上使用的新算法。与双陆棋不同，国际象棋有更复杂的状态空间和更多的可选动作。“深蓝”用大量的并行、专用硬件和游戏树的高效搜索打败了加里·卡斯帕罗夫 [17]。围棋因其庞大的状态空间被认为是更难的游戏，AlphaGo在2016年用结合深度学习与蒙特卡洛树采样的方法达到了人类水准 [18]。对德州扑克游戏而言，除了巨大的状态空间之外，更大的挑战是游戏的信息并不完全可见，例如看不到对手的牌。而“冷扑大师”用高效的策略体系超越了人类玩家的表现 [19]。以上的例子都体现出了先进的算法是人工智能在游戏上的表现提升的重要原因。

-* 机器学习的崛起促使自动驾驶领域空前发展。尽管距离完全的自主驾驶还有很长的路要走，但诸如[Momenta](http://www.momenta.com)、[Tesla](http://www.tesla.com)、[NVIDIA](http://www.nvidia.com)、 [MobilEye](http://www.mobileye.com)和[Waymo](http://www.waymo.com)这样的公司交出的具有部分自主驾驶功能的产品展示出了这个领域巨大的进步。完全自主驾驶的难点在于它需要将感知、思考和规则整合在同一个系统中。目前，深度学习主要被应用在计算机视觉的部分，剩余的部分还是需要工程师们的大量调试。
+* 机器学习的崛起促使自动驾驶领域空前发展。尽管距离完全的自主驾驶还有很长的路要走，但诸如[Momenta](http://www.momenta.ai)、[Tesla](http://www.tesla.com)、[NVIDIA](http://www.nvidia.com)、 [MobilEye](http://www.mobileye.com)和[Waymo](http://www.waymo.com)这样的公司交出的具有部分自主驾驶功能的产品展示出了这个领域巨大的进步。完全自主驾驶的难点在于它需要将感知、思考和规则整合在同一个系统中。目前，深度学习主要被应用在计算机视觉的部分，剩余的部分还是需要工程师们的大量调试。

 以上列出的仅仅是近年来深度学习所取得的成果的冰山一角。机器人学、物流管理、计算生物学、粒子物理学和天文学近年来的发展也有部分要归功于深度学习。可以看到，深度学习已经逐渐演变成一个工程师和科学家皆可使用的普适工具。


--- a/chapter_introduction/preface.md
+++ b/chapter_introduction/preface.md
@@ -53,7 +53,7 @@

 ## 致谢

-我们无比感谢本书的中英文版稿件贡献者和论坛用户们。他们帮助增添或改进了书中内容并提供了有价值的反馈。特别地，我们要感谢每一位为这本中文版开源书提交内容改动的贡献者们。这些贡献者的Github用户名和姓名（如提供）是：aa12356jm（崔永明）、aaronzs（Aaron Sun）、AceCoooool（陈斌斌）、alues（曾元豪）、Andiedie（周长安）、Angzz（李昂）、cgraywang（王晨光）、ChaiBapchya（Chaitanya Prakash Bapat）、chiyahoho（金杰）、daizuozhuo（戴作卓）、danteliujie（刘捷）、daquexian（张建浩）、DarkWings520（梓善）、delphi-tang（唐佐林）、DHRUV536、ding-hai（丁海）、DL-85（郭晶博）、duanhong169（段弘）、elliotxx（杨英明）、eric-haibin-lin（林海滨）、Evensgn（范舟）、fcbruce（李律）、Feywell（李阳）、fierceX（夏鲁豫）、foreversailor（张鹏）、gcaxuxi（徐曦）、Ghostish（Kangel Zenn）、GYingchao（Richard CUI）、gyp03（郭云鹏）、hank123456、haojin2（金颢）、hardfish82、hetong007（何通）、HITjialanyu（高剑伟 ）、hlnull（王海龙）、htoooth、hufuyu、hukun01（Kun Hu）、ibyte2011（刘俊朋）、icemelon9（沈海晨）、inkydragon（韩承宇）、Jerryzcn（张钟越）、Jing-Luo（罗晶）、jiqirer（jiqirer）、Jonariguez（贾忠祥）、jwwthu（姜蔚蔚）、kaonashi-tyc（田宇琛）、kevinthesun（王曜）、kli-nlpr（李凯）、lanking520（兰青）、Laurawly（王乐园）、leezu（Leonard Lausen）、leizhag（张雷）、leocvml（鄭宇翔）、linbojin、lingss0918、LinkHS（杨大卫）、liujia1（刘佳）、loveisp（戴玮）、MachineIntellect（贾老坏）、mingloo（陆明）、MoodMAX（张亚鹏）、mzchtx（李超）、nlpjoe（周俊佐）、noobbull（Liang Jinzheng）、omg2hei（童话）、oneTaken（彭小平）、PEGASUS1993（王皓）、pengyuanzhuo（彭远卓）、PeterHuang2015（黄焖鸡）、piiswrong（解浚源）、Ramlinbird（彭艺宇）、rebounding（刘铭）、reminisce（吴俊）、rliu054（刘睿）、rongruosong（张绍明）、SnailTyan（刘天池）、starsdeep（廖翊康）、sxjscience（施行健）、SyunSiu（孙畔勇）、szha（查晟）、szhengac（郑帅）、Tom-Ren（任杰骥）、wanghaizhen（王海珍）、wangx404（王鑫）、wangzhe258369、wangzhenhui1992（王振荟）、WenmuZhou（周军）、wkcn（吴侃）、wlbksy（汪磊）、wudayo、xcnick（徐驰）、XiaGenYuan（夏根源）、xiaotinghe（何孝霆）、XieGuochao（谢国超）、xinetzone（刘新伟）、xmfbit（肖梅峰）、xwind-h（黄晓烽）、yanwenlei（燕文磊）、yidawang（王贻达）、yifeim（马逸飞）、yixuan（邱怡轩）、yongwww（吴勇）、ypwhs（杨培文）、yufengwhy（余峰）、yupbank（Peng Yu）、yuweiw823（王雨薇）、yuxiangw（王宇翔）、yxyphoebe（喻心悦）、yzhao30（赵越）、yzhliu（刘忆智）、zhanghang1989（张航）、zheng-da（郑达）、zhiics（陈志）、zhouhang95（周航）、zhreshold（张帜）、zijie0（周远）。谢谢你们为每一位读者改进这本开源书。
+我们无比感谢本书的中英文版稿件贡献者和论坛用户们。他们帮助增添或改进了书中内容并提供了有价值的反馈。特别地，我们要感谢每一位为这本中文版开源书提交内容改动的贡献者们。这些贡献者的Github用户名和姓名（如提供）是：1000Delta（许致中）、aa12356jm（崔永明）、aaronzs（Aaron Sun）、AceCoooool（陈斌斌）、alues（曾元豪）、Andiedie（周长安）、Angzz（李昂）、cgraywang（王晨光）、ChaiBapchya（Chaitanya Prakash Bapat）、chiyahoho（金杰）、cholesky01（赵小华）、daizuozhuo（戴作卓）、danteliujie（刘捷）、daquexian（张建浩）、DarkWings520（梓善）、delphi-tang（唐佐林）、DHRUV536、ding-hai（丁海）、DL-85（郭晶博）、duanhong169（段弘）、elliotxx（杨英明）、eric-haibin-lin（林海滨）、Evensgn（范舟）、fcbruce（李律）、Feywell（李阳）、fierceX（夏鲁豫）、foreversailor（张鹏）、gcaxuxi（徐曦）、Ghostish（Kangel Zenn）、GYingchao（Richard CUI）、gyp03（郭云鹏）、hank123456、haojin2（金颢）、hardfish82、hetong007（何通）、HITjialanyu（高剑伟）、hlnull（王海龙）、htoooth、hufuyu、hukun01（Kun Hu）、ibyte2011（刘俊朋）、icemelon9（沈海晨）、inkydragon（韩承宇）、Jerryzcn（张钟越）、Jing-Luo（罗晶）、jiqirer（jiqirer）、Jonariguez（贾忠祥）、jwwthu（姜蔚蔚）、kaonashi-tyc（田宇琛）、kevinthesun（王曜）、kli-nlpr（李凯）、lanking520（兰青）、Laurawly（王乐园）、leezu（Leonard Lausen）、leizhag（张雷）、leocvml（鄭宇翔）、linbojin、lingss0918、LinkHS（杨大卫）、liujia1（刘佳）、loveisp（戴玮）、MachineIntellect（贾老坏）、mingloo（陆明）、MoodMAX（张亚鹏）、mzchtx（李超）、nlpjoe（周俊佐）、noobbull（Liang Jinzheng）、omg2hei（童话）、oneTaken（彭小平）、PEGASUS1993（王皓）、PenghuahuaPeng（彭大发）、pengyuanzhuo（彭远卓）、PeterHuang2015（黄焖鸡）、piiswrong（解浚源）、Ramlinbird（彭艺宇）、rebounding（刘铭）、reminisce（吴俊）、rliu054（刘睿）、rongruosong（张绍明）、scalerela（施洪）、SnailTyan（刘天池）、starsdeep（廖翊康）、sxjscience（施行健）、SyunSiu（孙畔勇）、szha（查晟）、szhengac（郑帅）、Tom-Ren（任杰骥）、wanghaizhen（王海珍）、wangx404（王鑫）、wangzhe258369、wangzhenhui1992（王振荟）、WenmuZhou（周军）、wkcn（吴侃）、wlbksy（汪磊）、wudayo、xcnick（徐驰）、XiaGenYuan（夏根源）、xiaotinghe（何孝霆）、XieGuochao（谢国超）、xinetzone（刘新伟）、xmfbit（肖梅峰）、xwind-h（黄晓烽）、yanwenlei（燕文磊）、yidawang（王贻达）、yifeim（马逸飞）、yixuan（邱怡轩）、yongwww（吴勇）、ypwhs（杨培文）、yufengwhy（余峰）、yupbank（Peng Yu）、yuweiw823（王雨薇）、yuxiangw（王宇翔）、yxyphoebe（喻心悦）、yzhao30（赵越）、yzhliu（刘忆智）、zhanghang1989（张航）、zheng-da（郑达）、zhiics（陈志）、zhouhang95（周航）、zhreshold（张帜）、zijie0（周远）。谢谢你们为每一位读者改进这本开源书。

 此外，我们感谢Amazon Web Services，特别是Swami Sivasubramanian、Raju Gulabani、Charlie Bell和Andrew Jassy在我们撰写本书时给予的慷慨支持。如果没有可用的时间、资源以及来自同事们的讨论和鼓励，就没有这本书的项目。我们还要感谢Apache MXNet团队实现了很多本书所使用的特性。另外，经过同事们的校勘，本书的质量得到了极大的提升。在此我们一一列出章节和校勘人，以表示我们由衷的感谢：引言的校勘人为金颢，预备知识的校勘人为吴俊，深度学习基础的校勘人为张航、王晨光、林海滨，深度学习计算的校勘人为查晟，卷积神经网络的校勘人为张帜、何通，循环神经网络的校勘人为查晟，优化算法的校勘人为郑帅，计算性能的校勘人为郑达、吴俊，计算机视觉的校勘人为解浚源、张帜、何通、张航，自然语言处理的校勘人为王晨光，附录的校勘人为金颢。

@@ -64,7 +64,7 @@

 ## 教学资源和反馈

-本书的英文版 Dive into Deep Learning 将被用作伯克利加州大学2019年春学期“Introduction to Deep Learning”课程的教材。相关教学资源（课件、视频、更多习题等）将会在网上公布。诚然，将算法、公式、图片、代码和样例统一进一本适合阅读的书，而且又是一系列有交互式体验的 Jupyter 记事本，是对我们极大的挑战。书中难免有很多疏忽的地方，敬请大家原谅，并希望你能通过每一节后面的二维码向我们反馈问题。
+本书的英文版 Dive into Deep Learning 将被用作加州大学伯克利分校2019年春学期“Introduction to Deep Learning”课程的教材。相关教学资源（课件、视频、更多习题等）将会在网上公布。诚然，将算法、公式、图片、代码和样例统一进一本适合阅读的书，而且又是一系列有交互式体验的 Jupyter 记事本，是对我们极大的挑战。书中难免有很多疏忽的地方，敬请大家原谅，并希望你能通过每一节后面的二维码向我们反馈问题。

 结尾处，附上陆游的一句诗作为勉励：


--- a/chapter_natural-language-processing/attention.md
+++ b/chapter_natural-language-processing/attention.md
@@ -73,6 +73,13 @@ $$

 其中含下标的$\boldsymbol{W}$和$\boldsymbol{b}$分别为门控循环单元的权重参数和偏差参数。

+
+
+## 发展
+
+本质上，注意力机制能够为特征中较有价值的部分分配较多的计算资源。这个有趣的想法自提出后得到了快速发展，特别是启发了依靠注意力机制来编码输入序列并解码出输出序列的变换器（Transformer）模型的设计 [2]。变换器抛弃了卷积神经网络和循环神经网络的架构。它在计算效率上比基于循环神经网络的编码器—解码器模型通常更具明显优势。含注意力机制的变换器的编码结构在后来的BERT预训练模型中得以应用并令后者大放异彩：微调后的模型在多达11项自然语言处理任务中取得了当时最先进的结果 [3]。除了自然语言处理领域，注意力机制还被广泛用于图像分类、自动图像描述、唇语解读以及语音识别。
+
+
 ## 小结

 * 我们可以在解码器的每个时间步使用不同的背景变量，并对输入序列中不同时间步编码的信息分配不同的注意力。
@@ -96,3 +103,7 @@ $$
 ## 参考文献

 [1] Bahdanau, D., Cho, K., & Bengio, Y. (2014). Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473.
+
+[2] Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. In Advances in Neural Information Processing Systems (pp. 5998-6008).
+
+[3] Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805.
--- a/chapter_natural-language-processing/beam-search.md
+++ b/chapter_natural-language-processing/beam-search.md
@@ -13,7 +13,7 @@ $$y_{t'} = \operatorname*{argmax}_{y \in \mathcal{Y}} \mathbb{P}(y \mid y_1, \ld

 作为输出。一旦搜索出“&lt;eos&gt;”符号，或者输出序列长度已经达到了最大长度$T'$，便完成输出。

-我们在描述解码器是提到，基于输入序列生成输出序列的条件概率是$\prod_{t'=1}^{T'} \mathbb{P}(y_{t'} \mid y_1, \ldots, y_{t'-1}, \boldsymbol{c})$。我们将该条件概率最大的输出序列称为最优序列。而贪婪搜索的主要问题是不能保证得到最优序列。
+我们在描述解码器时提到，基于输入序列生成输出序列的条件概率是$\prod_{t'=1}^{T'} \mathbb{P}(y_{t'} \mid y_1, \ldots, y_{t'-1}, \boldsymbol{c})$。我们将该条件概率最大的输出序列称为最优序列。而贪婪搜索的主要问题是不能保证得到最优序列。

 下面我们来看一个例子。假设输出词典里面有“A”、“B”、“C”和“&lt;eos&gt;”这四个词。图10.9中每个时间步下的四个数字分别代表了该时间步生成“A”、“B”、“C”和“&lt;eos&gt;”这四个词的条件概率。在每个时间步，贪婪搜索选取条件概率最大的词。因此，图10.9中将生成输出序列“A”、“B”、“C”、“&lt;eos&gt;”。该输出序列的条件概率是$0.5\times0.4\times0.4\times0.6 = 0.048$。


--- a/chapter_natural-language-processing/glove.md
+++ b/chapter_natural-language-processing/glove.md
@@ -42,7 +42,7 @@ $$\sum_{i\in\mathcal{V}} \sum_{j\in\mathcal{V}} h(x_{ij}) \left(\boldsymbol{u}_j
 我们还可以从另外一个角度来理解GloVe词嵌入。沿用本节前面的符号，$\mathbb{P}(w_j \mid w_i)$表示数据集中以$w_i$为中心词生成背景词$w_j$的条件概率，并记作$p_{ij}$。作为源于某大型语料库的真实例子，以下列举了两组分别以“ice”（“冰”）和“steam”（“蒸汽”）为中心词的条件概率以及它们之间的比值 [1]：

 |$w_k$=|“solid”|“gas”|“water”|“fashion”|
-|--:|:-:|:-:|:-:|
+|--:|:-:|:-:|:-:|:-:|
 |$p_1=\mathbb{P}(w_k\mid$ “ice” $)$|0.00019|0.000066|0.003|0.000017|
 |$p_2=\mathbb{P}(w_k\mid$ “steam” $)$|0.000022|0.00078|0.0022|0.000018|
 |$p_1/p_2$|8.9|0.085|1.36|0.96|
@@ -51,7 +51,7 @@ $$\sum_{i\in\mathcal{V}} \sum_{j\in\mathcal{V}} h(x_{ij}) \left(\boldsymbol{u}_j
 我们可以观察到以下现象：

 * 对于与“ice”相关而与“steam”不相关的词$w_k$，例如$w_k=$“solid”（“固体”），我们期望条件概率比值较大，例如上表最后一行中的值8.9；
-* 对于与“ice”不相关而与steam相关的词$w_k$，例如$w_k=$“gas”（“气体”），我们期望条件概率比值较小，例如上表最后一行中的值0.085；
+* 对于与“ice”不相关而与“steam”相关的词$w_k$，例如$w_k=$“gas”（“气体”），我们期望条件概率比值较小，例如上表最后一行中的值0.085；
 * 对于与“ice”和“steam”都相关的词$w_k$，例如$w_k=$“water”（“水”），我们期望条件概率比值接近1，例如上表最后一行中的值1.36；
 * 对于与“ice”和“steam”都不相关的词$w_k$，例如$w_k=$“fashion”（“时尚”），我们期望条件概率比值接近1，例如上表最后一行中的值0.96。


--- a/chapter_natural-language-processing/index.md
+++ b/chapter_natural-language-processing/index.md
 # 自然语言处理

-自然语言处理关注计算机与人类之间的自然语言交互。在实际中，我们常常使用自然语言处理技术，例如“循环神经网络”一章中介绍的语言模型，来处理和分析大量的自然语言数据。
+自然语言处理关注计算机与人类之间的自然语言交互。在实际中，我们常常使用自然语言处理技术，如“循环神经网络”一章中介绍的语言模型，来处理和分析大量的自然语言数据。本章中，根据输入与输出的不同形式，我们按“定长到定长”、“不定长到定长”、“不定长到不定长”的顺序，逐步展示在自然语言处理中如何表征并变换定长的词或类别，以及不定长的句子或段落序列。

-本章中，我们将先介绍如何用向量表示词，并在语料库上训练词向量。我们还将应用在更大语料库上预训练的词向量求近义词和类比词。接着，在文本分类任务中，我们进一步应用词向量分析文本情感，并分别基于循环神经网络和卷积神经网络讲解时序数据分类的两种重要思路。此外，自然语言处理任务中很多输出是不定长的，例如任意长度的句子。我们将描述应对这类问题的编码器—解码器模型、束搜索和注意力机制，并将它们应用于机器翻译中。
+我们先介绍如何用向量表示词，并在语料库上训练词向量。之后，我们把在更大语料库上预训练的词向量应用于求近义词和类比词，即“定长到定长”。接着，在文本分类这种“不定长到定长”的任务中，我们进一步应用词向量来分析文本情感，并分别基于循环神经网络和卷积神经网络为表征时序数据提供两种思路。此外，自然语言处理任务中很多输出是不定长的，如任意长度的句子或段落。我们将描述应对这类问题的编码器—解码器模型、束搜索和注意力机制，并动手实践“不定长到不定长”的机器翻译任务。

 ```eval_rst


--- a/chapter_natural-language-processing/machine-translation.md
+++ b/chapter_natural-language-processing/machine-translation.md
@@ -213,7 +213,7 @@ def train(encoder, decoder, dataset, lr, batch_size, num_epochs):
    loss = gloss.SoftmaxCrossEntropyLoss()
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
-        l_sum = 0
+        l_sum = 0.0
        for X, Y in data_iter:
            with autograd.record():
                l = batch_loss(encoder, decoder, X, Y, loss)

--- a/chapter_natural-language-processing/word2vec-gluon.md
+++ b/chapter_natural-language-processing/word2vec-gluon.md
@@ -291,7 +291,7 @@ def train(net, lr, num_epochs):
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': lr})
    for epoch in range(num_epochs):
-        start_time, train_l_sum = time.time(), 0
+        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [
                data.as_in_context(ctx) for data in batch]
@@ -302,10 +302,10 @@ def train(net, lr, num_epochs):
                     mask.shape[1] / mask.sum(axis=1))
            l.backward()
            trainer.step(batch_size)
-            train_l_sum += l.mean().asscalar()
-        print('epoch %d, train loss %.2f, time %.2fs'
-              % (epoch + 1, train_l_sum / len(data_iter),
-                 time.time() - start_time))
+            l_sum += l.sum().asscalar()
+            n += l.size
+        print('epoch %d, loss %.2f, time %.2fs'
+              % (epoch + 1, l_sum / n, time.time() - start))
 ```

 现在我们可以训练使用负采样的跳字模型了。

--- a/chapter_recurrent-neural-networks/bptt.md
+++ b/chapter_recurrent-neural-networks/bptt.md
@@ -2,7 +2,7 @@

 如果你做了上一节的练习，你会发现，如果不裁剪梯度，模型将无法正常训练。为了深刻理解这一现象，本节将介绍循环神经网络中梯度的计算和存储方法，即通过时间反向传播（back-propagation through time）。

-我们在[“正向传播、反向传播和计算图”](../chapter_deep-learning-basics/backprop.md)一节中介绍了神经网络中梯度计算与存储的一般思路，并强调正向传播和反向传播相互依赖。正向传播在循环神经网络比较直观。通过时间反向传播其实是反向传播在循环神经网络中的具体应用。我们需要将循环神经网络按时间步展开，从而得到模型变量和参数之间的依赖关系，并依据链式法则应用反向传播计算并存储梯度。
+我们在[“正向传播、反向传播和计算图”](../chapter_deep-learning-basics/backprop.md)一节中介绍了神经网络中梯度计算与存储的一般思路，并强调正向传播和反向传播相互依赖。正向传播在循环神经网络中比较直观，而通过时间反向传播其实是反向传播在循环神经网络中的具体应用。我们需要将循环神经网络按时间步展开，从而得到模型变量和参数之间的依赖关系，并依据链式法则应用反向传播计算并存储梯度。


 ## 定义模型

--- a/chapter_recurrent-neural-networks/lang-model-dataset.md
+++ b/chapter_recurrent-neural-networks/lang-model-dataset.md
@@ -17,7 +17,7 @@ with zipfile.ZipFile('../data/jaychou_lyrics.txt.zip') as zin:
 corpus_chars[:40]
 ```

-这个数据集有五万多个字符。为了打印方便，我们把换行符替换成空格，然后仅使用前一万个字符来训练模型。
+这个数据集有六万多个字符。为了打印方便，我们把换行符替换成空格，然后仅使用前一万个字符来训练模型。

 ```{.python .input  n=14}
 corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')

--- a/chapter_recurrent-neural-networks/rnn-gluon.md
+++ b/chapter_recurrent-neural-networks/rnn-gluon.md
@@ -108,11 +108,11 @@ def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                            {'learning_rate': lr, 'momentum': 0, 'wd': 0})

    for epoch in range(num_epochs):
-        loss_sum, start = 0.0, time.time()
+        l_sum, n, start = 0.0, 0, time.time()
        data_iter = gb.data_iter_consecutive(
            corpus_indices, batch_size, num_steps, ctx)
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
-        for t, (X, Y) in enumerate(data_iter):
+        for X, Y in data_iter:
            for s in state:
                s.detach()
            with autograd.record():
@@ -124,15 +124,16 @@ def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
            params = [p.data() for p in model.collect_params().values()]
            gb.grad_clipping(params, clipping_theta, ctx)
            trainer.step(1)  # 因为已经误差取过均值，梯度不用再做平均。
-            loss_sum += l.asscalar()
+            l_sum += l.asscalar() * y.size
+            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
-                epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
+                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_gluon(
-                    prefix, pred_len, model, vocab_size,
-                    ctx, idx_to_char, char_to_idx))
+                    prefix, pred_len, model, vocab_size, ctx, idx_to_char,
+                    char_to_idx))
 ```

 使用和上一节实验中一样的超参数来训练模型。

--- a/chapter_recurrent-neural-networks/rnn-scratch.md
+++ b/chapter_recurrent-neural-networks/rnn-scratch.md
@@ -134,7 +134,7 @@ $$ \min\left(\frac{\theta}{\|\boldsymbol{g}\|}, 1\right)\boldsymbol{g}$$
 ```{.python .input  n=10}
 # 本函数已保存在 gluonbook 包中方便以后使用。
 def grad_clipping(params, theta, ctx):
-    norm = nd.array([0.0], ctx)
+    norm = nd.array([0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
@@ -180,9 +180,9 @@ def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
    for epoch in range(num_epochs):
        if not is_random_iter:  # 如使用相邻采样，在 epoch 开始时初始化隐藏状态。
            state = init_rnn_state(batch_size, num_hiddens, ctx)
-        loss_sum, start = 0.0, time.time()
+        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
-        for t, (X, Y) in enumerate(data_iter):
+        for X, Y in data_iter:
            if is_random_iter:  # 如使用随机采样，在每个小批量更新前初始化隐藏状态。
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:  # 否则需要使用 detach 函数从计算图分离隐藏状态。
@@ -202,11 +202,12 @@ def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # 裁剪梯度。
            gb.sgd(params, lr, 1)  # 因为误差已经取过均值，梯度不用再做平均。
-            loss_sum += l.asscalar()
+            l_sum += l.asscalar() * y.size
+            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
-                epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
+                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,

--- a/data/aclImdb_tiny.zip
+++ b/data/aclImdb_tiny.zip
--- a/environment.yml
+++ b/environment.yml
@@ -5,6 +5,5 @@ dependencies:
 - matplotlib=2.2.2
 - pandas=0.23.2
 - pip:
-  - requests==2.18.4
  - mxnet==1.5.0b20181215
-  - gluonbook==0.8.8
+  - gluonbook==0.8.10
--- a/gluonbook/__init__.py
+++ b/gluonbook/__init__.py
-
 from .utils import *

-__version__ = '0.8.8'
+__version__ = '0.8.10'
--- a/gluonbook/utils.py
+++ b/gluonbook/utils.py
@@ -30,11 +30,6 @@ VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
                [0, 64, 128]]


-def accuracy(y_hat, y):
-    """Get accuracy."""
-    return (y_hat.argmax(axis=1) == y.astype('float32')).mean().asscalar()
-
-
 def bbox_to_rect(bbox, color):
    """Convert bounding box to matplotlib format."""
    return plt.Rectangle(xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0],
@@ -156,16 +151,15 @@ def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
    """Evaluate accuracy of a model on the given data set."""
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
-    acc = nd.array([0])
-    n = 0
+    acc_sum, n = nd.array([0]), 0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, ctx)
        for X, y in zip(features, labels):
            y = y.astype('float32')
-            acc += (net(X).argmax(axis=1) == y).sum().copyto(mx.cpu())
+            acc_sum += (net(X).argmax(axis=1) == y).sum().copyto(mx.cpu())
            n += y.size
-        acc.wait_to_read()
-    return acc.asscalar() / n
+        acc_sum.wait_to_read()
+    return acc_sum.asscalar() / n


 def _get_batch(batch, ctx):
@@ -174,8 +168,7 @@ def _get_batch(batch, ctx):
    if labels.dtype != features.dtype:
        labels = labels.astype(features.dtype)
    return (gutils.split_and_load(features, ctx),
-            gutils.split_and_load(labels, ctx),
-            features.shape[0])
+            gutils.split_and_load(labels, ctx), features.shape[0])


 def get_data_ch7():
@@ -209,7 +202,7 @@ def get_vocab_imdb(data):
 def grad_clipping(params, theta, ctx):
    """Clip the gradient."""
    if theta is not None:
-        norm = nd.array([0.0], ctx)
+        norm = nd.array([0], ctx)
        for param in params:
            norm += (param.grad ** 2).sum()
        norm = norm.sqrt().asscalar()
@@ -490,6 +483,7 @@ def show_bboxes(axes, bboxes, labels=None, colors=None):


 def show_fashion_mnist(images, labels):
+    """Plot Fashion-MNIST images with labels."""
    use_svg_display()
    _, figs = plt.subplots(1, len(images), figsize=(12, 12))
    for f, img, lbl in zip(figs, images, labels):
@@ -512,6 +506,7 @@ def show_images(imgs, num_rows, num_cols, scale=2):


 def show_trace_2d(f, res):
+    """Show the trace of 2d variables during optimization."""
    x1, x2 = zip(*res)
    set_figsize()
    plt.plot(x1, x2, '-o', color='#ff7f0e')
@@ -538,9 +533,8 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
    print('training on', ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
-    for epoch in range(1, num_epochs + 1):
-        train_l_sum, train_acc_sum, n, m = 0.0, 0.0, 0.0, 0.0
-        start = time.time()
+    for epoch in range(num_epochs):
+        train_l_sum, train_acc_sum, n, m, start = 0.0, 0.0, 0, 0, time.time()
        for i, batch in enumerate(train_iter):
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
@@ -549,21 +543,21 @@ def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
+            trainer.step(batch_size)
+            train_l_sum += sum([l.sum().asscalar() for l in ls])
+            n += sum([l.size for l in ls])
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                 for y_hat, y in zip(y_hats, ys)])
-            train_l_sum += sum([l.sum().asscalar() for l in ls])
-            trainer.step(batch_size)
-            n += batch_size
            m += sum([y.size for y in ys])
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
-              % (epoch, train_l_sum / n, train_acc_sum / m, test_acc,
+              % (epoch + 1, train_l_sum / n, train_acc_sum / m, test_acc,
                 time.time() - start))


 def train_2d(trainer):
-    """Train a 2d object function with a customized trainer"""
+    """Optimize the objective function of 2d variables with a customized trainer."""
    x1, x2 = -5, -2
    s_x1, s_x2 = 0, 0
    res = [(x1, x2)]
@@ -590,9 +584,9 @@ def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx)
-        loss_sum, start = 0.0, time.time()
+        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
-        for t, (X, Y) in enumerate(data_iter):
+        for X, Y in data_iter:
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
@@ -607,11 +601,12 @@ def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            sgd(params, lr, 1)
-            loss_sum += l.asscalar()
+            l_sum += l.asscalar() * y.size
+            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
-                epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
+                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
@@ -629,11 +624,11 @@ def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                            {'learning_rate': lr, 'momentum': 0, 'wd': 0})

    for epoch in range(num_epochs):
-        loss_sum, start = 0.0, time.time()
+        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_consecutive(
            corpus_indices, batch_size, num_steps, ctx)
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
-        for t, (X, Y) in enumerate(data_iter):
+        for X, Y in data_iter:
            for s in state:
                s.detach()
            with autograd.record():
@@ -644,63 +639,64 @@ def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
            params = [p.data() for p in model.collect_params().values()]
            grad_clipping(params, clipping_theta, ctx)
            trainer.step(1)
-            loss_sum += l.asscalar()
+            l_sum += l.asscalar() * y.size
+            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
-                epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
+                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_gluon(
-                    prefix, pred_len, model, vocab_size,
-                    ctx, idx_to_char, char_to_idx))
+                    prefix, pred_len, model, vocab_size, ctx, idx_to_char,
+                    char_to_idx))


 def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
-    """Train and evaluate a model on CPU."""
-    for epoch in range(1, num_epochs + 1):
-        train_l_sum = 0
-        train_acc_sum = 0
+    """Train and evaluate a model with CPU."""
+    for epoch in range(num_epochs):
+        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
-                l = loss(y_hat, y)
+                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
-            train_l_sum += l.mean().asscalar()
-            train_acc_sum += accuracy(y_hat, y)
+            y = y.astype('float32')
+            train_l_sum += l.asscalar()
+            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
+            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
-              % (epoch, train_l_sum / len(train_iter),
-                 train_acc_sum / len(train_iter), test_acc))
+              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))


 def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs):
-    """Train and evaluate a model on CPU or GPU."""
+    """Train and evaluate a model with CPU or GPU."""
    print('training on', ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()
-    for epoch in range(1, num_epochs + 1):
-        train_l_sum = 0
-        train_acc_sum = 0
-        start = time.time()
+    for epoch in range(num_epochs):
+        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            with autograd.record():
                y_hat = net(X)
-                l = loss(y_hat, y)
+                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
-            train_l_sum += l.mean().asscalar()
-            train_acc_sum += accuracy(y_hat, y)
+            y = y.astype('float32')
+            train_l_sum += l.asscalar()
+            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
+            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
-              % (epoch, train_l_sum / len(train_iter),
-                 train_acc_sum / len(train_iter), test_acc, time.time() - start))
+              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
+                 time.time() - start))


 def train_ch7(trainer_fn, states, hyperparams, features, labels, batch_size=10,
@@ -796,7 +792,7 @@ def use_svg_display():


 def voc_label_indices(colormap, colormap2label):
-    """Assig label indices for Pascal VOC2012 Dataset."""
+    """Assign label indices for Pascal VOC2012 Dataset."""
    colormap = colormap.astype('int32')
    idx = ((colormap[:, :, 0] * 256 + colormap[:, :, 1]) * 256
           + colormap[:, :, 2])
@@ -838,4 +834,3 @@ class VOCSegDataset(gdata.Dataset):

    def __len__(self):
        return len(self.data)
-
--- a/img/finetune.svg
+++ b/img/finetune.svg
@@ -90,21 +90,27 @@
 <path style="stroke:none;" d="M 1.125 0 L 1.125 -5.625 L 5.625 -5.625 L 5.625 0 Z M 1.265625 -0.140625 L 5.484375 -0.140625 L 5.484375 -5.484375 L 1.265625 -5.484375 Z M 1.265625 -0.140625 "/>
 </symbol>
 <symbol overflow="visible" id="glyph1-1">
-<path style="stroke:none;" d="M 2.171875 0 L 3.1875 -4.859375 C 2.75 -4.515625 2.128906 -4.238281 1.328125 -4.03125 L 1.484375 -4.75 C 1.878906 -4.914062 2.269531 -5.125 2.65625 -5.375 C 3.039062 -5.625 3.328125 -5.847656 3.515625 -6.046875 C 3.640625 -6.160156 3.753906 -6.300781 3.859375 -6.46875 L 4.3125 -6.46875 L 2.96875 0 Z M 2.171875 0 "/>
+<path style="stroke:none;" d="M 3.359375 0 L 2.5625 0 L 2.5625 -5.046875 C 2.375 -4.859375 2.125 -4.671875 1.8125 -4.484375 C 1.5 -4.304688 1.222656 -4.175781 0.984375 -4.09375 L 0.984375 -4.859375 C 1.421875 -5.054688 1.804688 -5.300781 2.140625 -5.59375 C 2.472656 -5.894531 2.707031 -6.1875 2.84375 -6.46875 L 3.359375 -6.46875 Z M 3.359375 0 "/>
 </symbol>
 <symbol overflow="visible" id="glyph1-2">
 <path style="stroke:none;" d=""/>
 </symbol>
 <symbol overflow="visible" id="glyph1-3">
+<path style="stroke:none;" d="M 0.28125 -1.9375 L 0.28125 -2.734375 L 2.71875 -2.734375 L 2.71875 -1.9375 Z M 0.28125 -1.9375 "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-0">
+<path style="stroke:none;" d="M 1.125 0 L 1.125 -5.625 L 5.625 -5.625 L 5.625 0 Z M 1.265625 -0.140625 L 5.484375 -0.140625 L 5.484375 -5.484375 L 1.265625 -5.484375 Z M 1.265625 -0.140625 "/>
+</symbol>
+<symbol overflow="visible" id="glyph2-1">
 <path style="stroke:none;" d="M 0.359375 0 L 1.703125 -6.4375 L 2.5625 -6.4375 L 1.375 -0.734375 L 4.71875 -0.734375 L 4.5625 0 Z M 0.359375 0 "/>
 </symbol>
-<symbol overflow="visible" id="glyph1-4">
-<path style="stroke:none;" d="M 0.421875 -1.9375 L 0.578125 -2.734375 L 3.015625 -2.734375 L 2.84375 -1.9375 Z M 0.421875 -1.9375 "/>
+<symbol overflow="visible" id="glyph2-2">
+<path style="stroke:none;" d=""/>
 </symbol>
-<symbol overflow="visible" id="glyph2-0">
+<symbol overflow="visible" id="glyph3-0">
 <path style="stroke:none;" d="M 3.515625 -5.46875 L 1 -5.46875 L 1 -0.859375 L 3.515625 -0.859375 Z M 4.34375 -6.28125 L 4.34375 -0.03125 L 0.15625 -0.03125 L 0.15625 -6.28125 Z M 4.34375 -6.28125 "/>
 </symbol>
-<symbol overflow="visible" id="glyph2-1">
+<symbol overflow="visible" id="glyph3-1">
 <path style="stroke:none;" d="M 6.78125 -1.390625 L 6.78125 0 L 8.203125 0 L 8.203125 -1.390625 Z M 3.78125 -1.390625 L 3.78125 0 L 5.203125 0 L 5.203125 -1.390625 Z M 0.796875 -1.390625 L 0.796875 0 L 2.203125 0 L 2.203125 -1.390625 Z M 0.796875 -1.390625 "/>
 </symbol>
 </g>
@@ -135,20 +141,18 @@
  <use xlink:href="#glyph0-6" x="91.32816" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-3" x="94.32516" y="89.5"/>
-</g>
-<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-2" x="99.16806" y="89.5"/>
+  <use xlink:href="#glyph2-1" x="94.32516" y="89.5"/>
+  <use xlink:href="#glyph2-2" x="99.16806" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-4" x="101.66826" y="89.5"/>
-  <use xlink:href="#glyph1-2" x="104.66526" y="89.5"/>
+  <use xlink:href="#glyph1-3" x="101.668422" y="89.5"/>
+  <use xlink:href="#glyph1-2" x="104.665422" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-1" x="107.16546" y="89.5"/>
+  <use xlink:href="#glyph1-1" x="107.165622" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-2" x="112.17126" y="89.5"/>
+  <use xlink:href="#glyph2-2" x="112.171352" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
  <use xlink:href="#glyph0-7" x="114.67184" y="89.5"/>
@@ -174,7 +178,6 @@
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
  <use xlink:href="#glyph1-1" x="235.74557" y="164"/>
-  <use xlink:href="#glyph1-2" x="240.75137" y="164"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
  <use xlink:href="#glyph0-7" x="243.25143" y="164"/>
@@ -185,20 +188,18 @@
  <use xlink:href="#glyph0-6" x="226.32816" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-3" x="229.32516" y="89.5"/>
-</g>
-<g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-2" x="234.16806" y="89.5"/>
+  <use xlink:href="#glyph2-1" x="229.32516" y="89.5"/>
+  <use xlink:href="#glyph2-2" x="234.16806" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-4" x="236.66826" y="89.5"/>
-  <use xlink:href="#glyph1-2" x="239.66526" y="89.5"/>
+  <use xlink:href="#glyph1-3" x="236.668422" y="89.5"/>
+  <use xlink:href="#glyph1-2" x="239.665422" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-1" x="242.16546" y="89.5"/>
+  <use xlink:href="#glyph1-1" x="242.165622" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph1-2" x="247.17126" y="89.5"/>
+  <use xlink:href="#glyph2-2" x="247.171352" y="89.5"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
  <use xlink:href="#glyph0-7" x="249.67184" y="89.5"/>
@@ -254,10 +255,10 @@
  <use xlink:href="#glyph0-20" x="247.3475" y="16"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph2-1" x="99" y="127.369469"/>
+  <use xlink:href="#glyph3-1" x="99" y="127.369469"/>
 </g>
 <g style="fill:rgb(0%,0%,0%);fill-opacity:1;">
-  <use xlink:href="#glyph2-1" x="234" y="127.369469"/>
+  <use xlink:href="#glyph3-1" x="234" y="127.369469"/>
 </g>
 <path style="fill:none;stroke-width:1;stroke-linecap:round;stroke-linejoin:round;stroke:rgb(0%,0%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 144.183594 250 L 144.269531 243.898438 " transform="matrix(1,0,0,1,-41,-103)"/>
 <path style="fill:none;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke:rgb(0%,0%,0%);stroke-opacity:1;stroke-miterlimit:10;" d="M 144.324219 239.898438 L 144.269531 243.898438 M 142.769531 243.878906 L 144.324219 239.898438 L 145.769531 243.921875 " transform="matrix(1,0,0,1,-41,-103)"/>