PaddlePaddle / book

Commit 6cbf7fd8
Authored Aug 26, 2020 by chenlong

add three docs for paddle2.0

Parent: 9fb40dac
Showing 3 changed files with 1536 additions and 0 deletions:

paddle2.0_docs/image_classification/mnist_lenet_classification.ipynb   +666  -0
paddle2.0_docs/n_gram_model/n_gram_model.ipynb                         +344  -0
paddle2.0_docs/text_generation/text_generation_paddle.ipynb            +526  -0
paddle2.0_docs/image_classification/mnist_lenet_classification.ipynb   0 → 100644   (diff collapsed)

paddle2.0_docs/n_gram_model/n_gram_model.ipynb   0 → 100644   (diff collapsed)

paddle2.0_docs/text_generation/text_generation_paddle.ipynb   0 → 100644
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 基于GRU的Text Generation\n",
"文本生成是NLP领域中的重要组成部分,基于GRU,我们可以快速构建文本生成模型。"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.0.0-alpha0'"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import paddle\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"paddle.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 复现过程\n",
"## 1.下载数据\n",
"文件路径:https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt\n",
"保存为txt格式即可"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2.读取数据"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of text: 1115394 characters\n"
]
}
],
"source": [
"# 文件路径\n",
"path_to_file = './shakespeare.txt'\n",
"text = open(path_to_file, 'rb').read().decode(encoding='utf-8')\n",
"\n",
"# 文本长度是指文本中的字符个数\n",
"print ('Length of text: {} characters'.format(len(text)))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First Citizen:\n",
"Before we proceed any further, hear me speak.\n",
"\n",
"All:\n",
"Speak, speak.\n",
"\n",
"First Citizen:\n",
"You are all resolved rather to die than to famish?\n",
"\n",
"All:\n",
"Resolved. resolved.\n",
"\n",
"First Citizen:\n",
"First, you know Caius Marcius is chief enemy to the people.\n",
"\n"
]
}
],
"source": [
"# 看一看文本中的前 250 个字符\n",
"print(text[:250])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"65 unique characters\n"
]
}
],
"source": [
"# 文本中的非重复字符\n",
"vocab = sorted(set(text))\n",
"print ('{} unique characters'.format(len(vocab)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3.向量化文本\n",
"在训练之前,我们需要将字符串映射到数字表示值。创建两个查找表格:一个将字符映射到数字,另一个将数字映射到字符。"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# 创建从非重复字符到索引的映射\n",
"char2idx = {u:i for i, u in enumerate(vocab)}\n",
"idx2char = np.array(vocab)\n",
"# 用index表示文本\n",
"text_as_int = np.array([char2idx[c] for c in text])\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'\\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, \"'\": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}\n"
]
}
],
"source": [
"print(char2idx)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['\\n' ' ' '!' '$' '&' \"'\" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'\n",
" 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'\n",
" 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'\n",
" 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']\n"
]
}
],
"source": [
"print(idx2char)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"现在,每个字符都有一个整数表示值。请注意,我们将字符映射至索引 0 至 len(vocab)."
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[18 47 56 ... 45 8 0]\n",
"1115394\n"
]
}
],
"source": [
"print(text_as_int)\n",
"print(len(text_as_int))"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'First Citizen' ---- characters mapped to int ---- > [18 47 56 57 58 1 15 47 58 47 64 43 52]\n"
]
}
],
"source": [
"# 显示文本首 13 个字符的整数映射\n",
"print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))"
]
},
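{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick check in the other direction: idx2char decodes the integer ids\n",
"# back into text, so the first 13 ids should recover 'First Citizen'.\n",
"print(''.join(idx2char[text_as_int[:13]]))"
]
},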
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 预测任务\n",
"给定一个字符或者一个字符序列,下一个最可能出现的字符是什么?这就是我们训练模型要执行的任务。输入进模型的是一个字符序列,我们训练这个模型来预测输出 -- 每个时间步(time step)预测下一个字符是什么。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 创建训练样本和目标\n",
"接下来,将文本划分为样本序列。每个输入序列包含文本中的 seq_length 个字符。\n",
"\n",
"对于每个输入序列,其对应的目标包含相同长度的文本,但是向右顺移一个字符。\n",
"\n",
"将文本拆分为长度为 seq_length 的文本块。例如,假设 seq_length 为 4 而且文本为 “Hello”, 那么输入序列将为 “Hell”,目标序列将为 “ello”。"
]
},
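{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A tiny walk-through of the split described above, using the 'Hello' example\n",
"# with seq_length = 4: the input is 'Hell' and the target is the same text\n",
"# shifted right by one character, 'ello'. (demo_text is only for illustration.)\n",
"demo_text = 'Hello'\n",
"demo_seq_length = 4\n",
"print(demo_text[:demo_seq_length], '->', demo_text[1:demo_seq_length + 1])"
]
},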
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"seq_length = 100\n",
"def load_data(data, seq_length):\n",
" train_data = []\n",
" train_label = []\n",
" for i in range(len(data)//seq_length):\n",
" train_data.append(data[i*seq_length:(i+1)*seq_length])\n",
" train_label.append(data[i*seq_length + 1:(i+1)*seq_length+1])\n",
" return train_data, train_label\n",
"train_data, train_label = load_data(text_as_int, seq_length)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training data is :\n",
"First Citizen:\n",
"Before we proceed any further, hear me speak.\n",
"\n",
"All:\n",
"Speak, speak.\n",
"\n",
"First Citizen:\n",
"You\n",
"------------\n",
"training_label is:\n",
"irst Citizen:\n",
"Before we proceed any further, hear me speak.\n",
"\n",
"All:\n",
"Speak, speak.\n",
"\n",
"First Citizen:\n",
"You \n"
]
}
],
"source": [
"char_list = []\n",
"label_list = []\n",
"for char_id, label_id in zip(train_data[0], train_label[0]):\n",
" char_list.append(idx2char[char_id])\n",
" label_list.append(idx2char[label_id])\n",
"\n",
"print('training data is :')\n",
"print(''.join(char_list))\n",
"print(\"------------\")\n",
"print('training_label is:')\n",
"print(''.join(label_list))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 用`paddle.batch`完成数据的加载"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"batch_size = 64\n",
"def train_reader():\n",
" for i in range(len(train_data)):\n",
" yield train_data[i], train_label[i]\n",
"batch_reader = paddle.batch(train_reader, batch_size=batch_size) "
]
},
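{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick sanity check on the reader: batch_reader() returns a fresh generator\n",
"# each time it is called, and every batch is a list of batch_size (input, target)\n",
"# pairs, where each input/target holds seq_length character ids.\n",
"one_batch = next(batch_reader())\n",
"print(len(one_batch), len(one_batch[0][0]), len(one_batch[0][1]))"
]
},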
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 基于GRU构建文本生成模型"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"import numpy as np\n",
"\n",
"vocab_size = len(vocab)\n",
"embedding_dim = 256\n",
"hidden_size = 1024\n",
"class GRUModel(paddle.nn.Layer):\n",
" def __init__(self):\n",
" super(GRUModel, self).__init__()\n",
" self.embedding = paddle.nn.Embedding(size=[vocab_size, embedding_dim])\n",
" self.gru = paddle.incubate.hapi.text.GRU(input_size=embedding_dim, hidden_size=hidden_size)\n",
" self.linear1 = paddle.nn.Linear(hidden_size, hidden_size//2)\n",
" self.linear2 = paddle.nn.Linear(hidden_size//2, vocab_size)\n",
" def forward(self, x):\n",
" x = self.embedding(x)\n",
" x = paddle.reshape(x, [-1, 1, embedding_dim])\n",
" x, _ = self.gru(x)\n",
" x = paddle.reshape(x, [-1, hidden_size])\n",
" x = self.linear1(x)\n",
" x = paddle.nn.functional.relu(x)\n",
" x = self.linear2(x)\n",
" x = paddle.nn.functional.softmax(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch: 0, batch: 50, loss is: [3.7835407]\n",
"epoch: 0, batch: 100, loss is: [3.2774005]\n",
"epoch: 0, batch: 150, loss is: [3.2576294]\n",
"epoch: 1, batch: 50, loss is: [3.3434656]\n",
"epoch: 1, batch: 100, loss is: [2.9948606]\n",
"epoch: 1, batch: 150, loss is: [3.0285468]\n",
"epoch: 2, batch: 50, loss is: [3.133882]\n",
"epoch: 2, batch: 100, loss is: [2.7811327]\n",
"epoch: 2, batch: 150, loss is: [2.8133557]\n",
"epoch: 3, batch: 50, loss is: [3.000814]\n",
"epoch: 3, batch: 100, loss is: [2.6404488]\n",
"epoch: 3, batch: 150, loss is: [2.7050896]\n",
"epoch: 4, batch: 50, loss is: [2.9289591]\n",
"epoch: 4, batch: 100, loss is: [2.5629177]\n",
"epoch: 4, batch: 150, loss is: [2.6438713]\n",
"epoch: 5, batch: 50, loss is: [2.8832304]\n",
"epoch: 5, batch: 100, loss is: [2.5137548]\n",
"epoch: 5, batch: 150, loss is: [2.5926144]\n",
"epoch: 6, batch: 50, loss is: [2.8562953]\n",
"epoch: 6, batch: 100, loss is: [2.4752126]\n",
"epoch: 6, batch: 150, loss is: [2.5510798]\n",
"epoch: 7, batch: 50, loss is: [2.8426895]\n",
"epoch: 7, batch: 100, loss is: [2.4442513]\n",
"epoch: 7, batch: 150, loss is: [2.5187433]\n",
"epoch: 8, batch: 50, loss is: [2.8353484]\n",
"epoch: 8, batch: 100, loss is: [2.4200597]\n",
"epoch: 8, batch: 150, loss is: [2.4956212]\n",
"epoch: 9, batch: 50, loss is: [2.8308532]\n",
"epoch: 9, batch: 100, loss is: [2.4011066]\n",
"epoch: 9, batch: 150, loss is: [2.4787998]\n"
]
}
],
"source": [
"paddle.enable_imperative()\n",
"losses = []\n",
"def train(model):\n",
" model.train()\n",
" optim = paddle.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters())\n",
" for epoch in range(10):\n",
" batch_id = 0\n",
" for batch_data in batch_reader():\n",
" batch_id += 1\n",
" data = np.array(batch_data)\n",
" x_data = data[:, 0]\n",
" y_data = data[:, 1]\n",
" for i in range(len(x_data[0])):\n",
" x_char = x_data[:, i]\n",
" y_char = y_data[:, i]\n",
" x_char = paddle.imperative.to_variable(x_char)\n",
" y_char = paddle.imperative.to_variable(y_char)\n",
" predicts = model(x_char)\n",
" loss = paddle.nn.functional.cross_entropy(predicts, y_char)\n",
" avg_loss = paddle.mean(loss)\n",
" avg_loss.backward()\n",
" optim.minimize(avg_loss)\n",
" model.clear_gradients()\n",
" if batch_id % 50 == 0:\n",
" print(\"epoch: {}, batch: {}, loss is: {}\".format(epoch, batch_id, avg_loss.numpy()))\n",
" losses.append(loss.numpy())\n",
"model = GRUModel()\n",
"train(model)"
]
},
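{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the losses collected in `losses` during training (matplotlib.pyplot was\n",
"# imported as plt at the top of the notebook). Each entry is an array of loss\n",
"# values, so take the mean of each entry before plotting.\n",
"plt.plot([np.mean(l) for l in losses])\n",
"plt.xlabel('logged step')\n",
"plt.ylabel('training loss')\n",
"plt.show()"
]
},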
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 模型预测\n",
"利用训练好的模型,输出初始化文本'ROMEO: ',自动生成后续的num_generate个字符。"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ROMEO:I the the the the the the the the the the the the the the the the the the the the the the the the th\n"
]
}
],
"source": [
"def generate_text(model, start_string):\n",
" \n",
" model.eval()\n",
" num_generate = 100\n",
"\n",
" # Converting our start string to numbers (vectorizing)\n",
" input_eval = [char2idx[s] for s in start_string]\n",
" input_data = paddle.imperative.to_variable(np.array(input_eval))\n",
" input_data = paddle.reshape(input_data, [-1, 1])\n",
" text_generated = []\n",
"\n",
" for i in range(num_generate):\n",
" predicts = model(input_data)\n",
" predicts = predicts.numpy().tolist()[0]\n",
" # print(predicts)\n",
" predicts_id = predicts.index(max(predicts))\n",
" # print(predicts_id)\n",
" # using a categorical distribution to predict the character returned by the model\n",
" input_data = paddle.imperative.to_variable(np.array([predicts_id]))\n",
" input_data = paddle.reshape(input_data, [-1, 1])\n",
" text_generated.append(idx2char[predicts_id])\n",
" return (start_string + ''.join(text_generated))\n",
"print(generate_text(model, start_string=u\"ROMEO:\"))"
]
}
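,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Greedy decoding always picks the most probable character, which is why the output above quickly gets stuck repeating the same word. As a variation, the sketch below samples the next character from the predicted distribution instead, with a `temperature` knob added here for illustration; it reuses the trained `model` and the `char2idx` / `idx2char` mappings from above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_text_sampling(model, start_string, temperature=1.0, num_generate=100):\n",
"    # Same setup as generate_text: encode the seed string and reshape it for the model.\n",
"    model.eval()\n",
"    input_eval = [char2idx[s] for s in start_string]\n",
"    input_data = paddle.imperative.to_variable(np.array(input_eval))\n",
"    input_data = paddle.reshape(input_data, [-1, 1])\n",
"    text_generated = []\n",
"    for i in range(num_generate):\n",
"        # The model ends with a softmax, so each row is already a probability\n",
"        # distribution over the vocabulary; take the row for the latest character.\n",
"        probs = model(input_data).numpy()[-1].astype('float64')\n",
"        # Temperature below 1 sharpens the distribution, above 1 flattens it.\n",
"        probs = np.exp(np.log(probs + 1e-9) / temperature)\n",
"        probs = probs / probs.sum()\n",
"        predicts_id = np.random.choice(len(probs), p=probs)\n",
"        # Feed the sampled character back in as the next input.\n",
"        input_data = paddle.imperative.to_variable(np.array([predicts_id]))\n",
"        input_data = paddle.reshape(input_data, [-1, 1])\n",
"        text_generated.append(idx2char[predicts_id])\n",
"    return start_string + ''.join(text_generated)\n",
"\n",
"print(generate_text_sampling(model, start_string=u'ROMEO:', temperature=0.8))"
]
}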
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}