Commit d8eab128 authored by Yancey1989

update docker build

Parent 3c9ec464
......@@ -4,3 +4,5 @@ pandoc.template
.DS_Store
.idea
py_env*
*.ipynb
build
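
With `*.ipynb` and `build` now ignored, the generated notebooks and the Docker build context no longer show up as untracked files. A quick sanity check after this change (the chapter path is only an example; run from the repository root):

```bash
# Verify that generated artifacts match the new ignore rules.
git check-ignore -v build fit_a_line/README.ipynb
```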
#!/bin/bash
set -e
# Convert markdown to ipynb
/book/.tools/convert-markdown-into-ipynb-and-test.sh
# Cache dataset
/book/.tools/cache_dataset.py
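
This helper bundles the two generation steps that previously ran during `docker build`. A minimal sketch of re-running it by hand inside the built image, where the helper's file name is a placeholder (the diff does not show it):

```bash
# Re-run the convert + cache steps inside the built book image;
# /book is where the Dockerfile copies the repository checkout.
docker run --rm paddlepaddle/book:latest bash /book/.tools/prepare_book.sh  # script name is hypothetical
```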
......@@ -12,8 +12,8 @@ fi
#convert md to ipynb
.tools/convert-markdown-into-ipynb-and-test.sh
paddle_version=0.10.0rc2
latest_label=latest
paddle_tag=0.10.0rc2
latest_tag=latest
#generate docker file
if [ ${USE_UBUNTU_REPO_MIRROR} ]; then
......@@ -23,38 +23,25 @@ else
fi
mkdir -p build
cat > build/Dockerfile <<EOF1
FROM paddlepaddle/paddle:${paddle_version}
cat > ./build/Dockerfile << EOF
FROM paddlepaddle/paddle:${paddle_tag}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN ${update_mirror_cmd}
apt-get install locales
RUN localedef -f UTF-8 -i en_US en_US.UTF-8
RUN apt-get -y install gcc && \
apt-get -y clean
RUN pip install -U matplotlib jupyter numpy requests scipy
apt-get update && \
apt-get install -y locales && \
apt-get -y install gcc && \
apt-get -y clean && \
localedef -f UTF-8 -i en_US en_US.UTF-8 && \
pip install -U matplotlib jupyter numpy requests scipy
COPY . /book
RUN rm -rf /book/build
EOF1
if [ ${COPY_CACHE_DATA} ]; then
cat >> build/Dockerfile << EOF2
RUN mkdir -p /root/${cache_data_path}
RUN mv /book/${cache_data_path}/* /root/${cache_data_path}/ && rm -rf /book/${cache_data_path}
EOF2
fi
cat >> build/Dockerfile << EOF3
RUN /book/.tools/cache_dataset.py
EXPOSE 8888
CMD ["sh", "-c", "jupyter notebook --ip=0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.disable_check_xsrf=True /book/"]
EOF3
EOF
#build docker image
echo "paddle_version:"$paddle_version
docker build --no-cache -t paddlepaddle/book:${paddle_version} -t paddlepaddle/book:${latest_label} -f ./build/Dockerfile .
echo "paddle_tag:"$paddle_tag
docker build --no-cache -t paddlepaddle/book:${paddle_tag} -t paddlepaddle/book:${latest_tag} -f ./build/Dockerfile .
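
Usage is unchanged by the tag rename; a hedged sketch, assuming the script above is invoked as `./build.sh` from the repository root:

```bash
# Any non-empty value enables the Ubuntu apt mirror in the generated Dockerfile.
USE_UBUNTU_REPO_MIRROR=1 ./build.sh

# The Dockerfile's CMD starts Jupyter on port 8888 with token auth disabled,
# so the notebooks are reachable at http://localhost:8888 after this.
docker run -d -p 8888:8888 paddlepaddle/book:latest
```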
#!/usr/bin/env python
import paddle.v2.dataset as dataset
import nltk
# Cache conll05
dataset.common.download(dataset.conll05.WORDDICT_URL, 'conll05st', \
dataset.conll05.WORDDICT_MD5)
dataset.common.download(dataset.conll05.VERBDICT_URL, 'conll05st', \
dataset.conll05.VERBDICT_MD5)
dataset.common.download(dataset.conll05.TRGDICT_URL, 'conll05st', \
dataset.conll05.TRGDICT_MD5)
# Cache imdb
dataset.common.download(dataset.imdb.URL, "imdb", dataset.imdb.MD5)
# Cache imikolov
dataset.common.download(dataset.imikolov.URL, "imikolov", dataset.imikolov.MD5)
# Cache movielens
dataset.common.download('http://files.grouplens.org/datasets/movielens/ml-1m.zip',\
'movielens','c4d9eecfca2ab87c1945afe126590906')
# Cache nltk
nltk.download('movie_reviews', download_dir=dataset.common.DATA_HOME)
# Cache uci housing
dataset.common.download(dataset.uci_housing.URL, "uci_housing", dataset.uci_housing.MD5)
# Cache wmt14
dataset.common.download(dataset.wmt14.URL_TRAIN, "wmt14", dataset.wmt14.MD5_TRAIN)
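
To confirm the datasets were actually baked into the image, one can list paddle's cache directory; this sketch assumes `dataset.common.DATA_HOME` defaults to `~/.cache/paddle/dataset` and that the container runs as root:

```bash
# List the datasets cached at image build time (the path is an assumption).
docker run --rm paddlepaddle/book:latest ls /root/.cache/paddle/dataset
```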
......@@ -5,14 +5,14 @@ if [ $? -ne 0 ]; then
exit 1
fi
GOPATH=~/.cache/go go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
export GOPATH=~/go; go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
cur_path="$(cd "$(dirname "$0")" && pwd -P)"
cd $cur_path/../
#convert md to ipynb
for file in */{README,README\.en}.md ; do
~/.cache/go/bin/markdown-to-ipynb < $file > ${file%.*}".ipynb"
~/go/bin/markdown-to-ipynb < $file > ${file%.*}".ipynb"
if [ $? -ne 0 ]; then
echo >&2 "markdown-to-ipynb $file error"
exit 1
......
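
With `GOPATH` moved from `~/.cache/go` to `~/go`, a single chapter can still be converted by hand; a sketch, where the chapter directory is only an example:

```bash
# Install the converter into the new GOPATH and convert one chapter's README.
export GOPATH=~/go
go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
chapter=fit_a_line   # example chapter directory; use any chapter of the book
~/go/bin/markdown-to-ipynb < $chapter/README.md > $chapter/README.ipynb
```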
......@@ -141,7 +141,7 @@
"\n",
"## 数据准备\n",
"\n",
"通用图像分类公开的标准数据集常用的有[CIFAR](\u003chttps://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。\n",
"通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。\n",
"\n",
"由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](\u003chttps://www.cs.toronto.edu/~kriz/cifar.html\u003e)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。\n",
"\n",
......
......@@ -88,33 +88,6 @@
"We use the [MovieLens ml-1m](http://files.grouplens.org/datasets/movielens/ml-1m.zip) to train our model. This dataset includes 10,000 ratings of 4,000 movies from 6,000 users to 4,000 movies. Each rate is in the range of 1~5. Thanks to GroupLens Research for collecting, processing and publishing the dataset.\n",
"\n",
"`paddle.v2.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens` and `wmt14`, etc. There's no need for us to manually download and preprocess `MovieLens` dataset.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# Run this block to show dataset's documentation\n",
"help(paddle.v2.dataset.movielens)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"The raw `MoiveLens` contains movie ratings, relevant features from both movies and users.\n",
"For instance, one movie's feature could be:\n",
......@@ -127,6 +100,7 @@
"editable": true
},
"source": [
"import paddle.v2 as paddle\n",
"movie_info = paddle.dataset.movielens.movie_info()\n",
"print movie_info.values()[0]\n"
],
......@@ -283,7 +257,6 @@
"import cPickle\n",
"\n",
"import paddle.v2 as paddle\n",
"\n",
"paddle.init(use_gpu=False)\n"
],
"outputs": [
......@@ -551,9 +524,9 @@
"editable": true
},
"source": [
"reader=paddle.reader.batch(\n",
"reader=paddle.batch(\n",
" paddle.reader.shuffle(\n",
" paddle.dataset.movielens.trai(), buf_size=8192),\n",
" paddle.dataset.movielens.train(), buf_size=8192),\n",
" batch_size=256)\n"
],
"outputs": [
......
......@@ -725,7 +725,7 @@
"infer_dict = copy.copy(feeding)\n",
"del infer_dict['score']\n",
"\n",
"prediction = paddle.infer(output=inference, parameters=parameters, input=[feature], feeding=infer_dict)\n",
"prediction = paddle.infer(inference, parameters=parameters, input=[feature], feeding=infer_dict)\n",
"score = (prediction[0][0] + 5.0) / 2\n",
"print \"[Predict] User %d Rating Movie %d With Score %.2f\"%(user_id, movie_id, score)\n"
],
......