add notebook test

20f19768 · Hui Zhang · c607bff2 · 20f19768 · 20f19768 · 20f19768
4 changed files
--- a/.notebook/Linear_test.ipynb
+++ b/.notebook/Linear_test.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "academic-surname",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  def convert_to_list(value, n, name, dtype=np.int):\n"
+     ]
+    }
+   ],
+   "source": [
+    "import paddle\n",
+    "from paddle import nn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fundamental-treasure",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      "  and should_run_async(code)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Paddle layers under test: a 256 -> 2048 Linear followed by a 2048 -> 256 Linear.\n",
+    "# Built left to right, so L is still constructed before L2.\n",
+    "L, L2 = nn.Linear(256, 2048), nn.Linear(2048, 256)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "consolidated-elephant",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import torch\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "moderate-noise",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "float64\n",
+      "Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
+      "       [[[-0.03137276,  0.75036579, -0.62955737, ..., -0.39516482,  2.41965628,  0.19466873],\n",
+      "         [ 0.55916852,  1.13357353,  0.28754908, ...,  0.28860641,  0.48257691, -1.07664418],\n",
+      "         [-0.27433595, -0.05911482,  0.04942252, ...,  0.46596146,  1.24395037, -1.98374581],\n",
+      "         ...,\n",
+      "         [-0.45322138,  0.51459873,  0.28475651, ..., -0.90797561, -0.80436397, -2.30388594],\n",
+      "         [ 0.20310247,  1.90435207, -1.02483511, ..., -1.59850407, -0.30733466,  0.49769276],\n",
+      "         [-2.63085651, -0.52244109,  0.32019949, ...,  1.10662329, -0.55995786, -0.36770794]],\n",
+      "\n",
+      "        [[-1.78831303,  2.24759626,  0.41386250, ..., -0.30020580, -0.16084948,  0.93251175],\n",
+      "         [ 0.03264519, -0.92942363,  1.58523536, ...,  1.23681784, -0.94711000,  0.63553023],\n",
+      "         [-0.19725564, -2.38587499, -0.29334834, ...,  0.83498263, -0.58492625,  0.58732986],\n",
+      "         ...,\n",
+      "         [-0.61646742, -1.02978027,  0.45410269, ...,  0.87052751, -0.20801133,  2.17943859],\n",
+      "         [-0.67230755, -0.79410625, -0.13054833, ..., -1.18138039, -0.47578079, -0.22610545],\n",
+      "         [ 2.57333422,  0.63872230,  0.70852041, ..., -0.44040251, -0.33339104, -0.24722832]]])\n",
+      "tensor([[[-0.0314,  0.7504, -0.6296,  ..., -0.3952,  2.4197,  0.1947],\n",
+      "         [ 0.5592,  1.1336,  0.2875,  ...,  0.2886,  0.4826, -1.0766],\n",
+      "         [-0.2743, -0.0591,  0.0494,  ...,  0.4660,  1.2440, -1.9837],\n",
+      "         ...,\n",
+      "         [-0.4532,  0.5146,  0.2848,  ..., -0.9080, -0.8044, -2.3039],\n",
+      "         [ 0.2031,  1.9044, -1.0248,  ..., -1.5985, -0.3073,  0.4977],\n",
+      "         [-2.6309, -0.5224,  0.3202,  ...,  1.1066, -0.5600, -0.3677]],\n",
+      "\n",
+      "        [[-1.7883,  2.2476,  0.4139,  ..., -0.3002, -0.1608,  0.9325],\n",
+      "         [ 0.0326, -0.9294,  1.5852,  ...,  1.2368, -0.9471,  0.6355],\n",
+      "         [-0.1973, -2.3859, -0.2933,  ...,  0.8350, -0.5849,  0.5873],\n",
+      "         ...,\n",
+      "         [-0.6165, -1.0298,  0.4541,  ...,  0.8705, -0.2080,  2.1794],\n",
+      "         [-0.6723, -0.7941, -0.1305,  ..., -1.1814, -0.4758, -0.2261],\n",
+      "         [ 2.5733,  0.6387,  0.7085,  ..., -0.4404, -0.3334, -0.2472]]])\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      "  and should_run_async(code)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One random (2, 51, 256) batch, handed to both frameworks as float32 so that\n",
+    "# paddle and torch see the same values.\n",
+    "x = np.random.randn(2, 51, 256)\n",
+    "print(x.dtype)\n",
+    "tx = torch.tensor(x, dtype=torch.float32)\n",
+    "px = paddle.to_tensor(x, dtype='float32')\n",
+    "for shown in (px, tx):\n",
+    "    print(shown)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cooked-progressive",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "mechanical-prisoner",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the reference arrays from the .npz archive.\n",
+    "# np.load returns an NpzFile that keeps the archive open and reads members on access,\n",
+    "# so every array is pulled out inside the with-block and the file is closed afterwards.\n",
+    "# NOTE(security): allow_pickle=True unpickles object arrays, which can execute arbitrary\n",
+    "# code - only load archives that come from a trusted source.\n",
+    "with np.load('enc_0_ff_out.npz', allow_pickle=True) as data:\n",
+    "    t_norm_ff = data['norm_ff']\n",
+    "    t_ff_out = data['ff_out']\n",
+    "    t_ff_l_x = data['ff_l_x']\n",
+    "    t_ff_l_a_x = data['ff_l_a_x']\n",
+    "    t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
+    "    # t_ps is indexed 0..3 by later cells as the weight/bias pairs of the two Linear layers.\n",
+    "    t_ps = data['ps']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "indie-marriage",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "assured-zambia",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Copy the reference parameters into the paddle layers. The same t_ps[0] / t_ps[2]\n",
+    "# arrays are loaded untransposed into torch.nn.Linear in a later cell, so they are\n",
+    "# transposed here (presumably because paddle keeps Linear weights as (in, out) - confirm).\n",
+    "L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
+    "L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
+    "\n",
+    "# Read the parameters back from both layers, in the same order as t_ps.\n",
+    "ps = [param for _, param in L.named_parameters()]\n",
+    "ps.extend(L2.state_dict().values())\n",
+    "\n",
+    "# Every reference entry is transposed before comparing; for a 1-D array .T changes nothing.\n",
+    "for param, ref in zip(ps, t_ps):\n",
+    "    print(np.allclose(param.numpy(), ref.T))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "committed-jacob",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "extreme-traffic",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "optimum-milwaukee",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "viral-indian",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Torch counterparts of the paddle layers, initialised from the same reference\n",
+    "# parameters (t_ps was loaded from the .npz archive in an earlier cell).\n",
+    "TL = torch.nn.Linear(256, 2048)\n",
+    "TL2 = torch.nn.Linear(2048, 256)\n",
+    "TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
+    "TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
+    "\n",
+    "# Read every state-dict entry of both layers back as NumPy and compare it with the\n",
+    "# reference array at the same position.\n",
+    "ps = [entry.data.numpy() for layer in (TL, TL2) for entry in layer.state_dict().values()]\n",
+    "for loaded, ref in zip(ps, t_ps):\n",
+    "    print(np.allclose(loaded, ref))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "skilled-vietnamese",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[-0.25528666 -0.9090747   0.12996    ...  0.02552819  0.37376517\n",
+      "  -0.558986  ]\n",
+      " [-0.45657372  0.23811203  0.33472425 ...  1.0797666  -0.7263612\n",
+      "   0.31549692]]\n",
+      "[[-0.25528657 -0.9090746   0.12996009 ...  0.02552832  0.37376505\n",
+      "  -0.5589858 ]\n",
+      " [-0.45657367  0.23811209  0.33472428 ...  1.0797666  -0.7263612\n",
+      "   0.31549698]]\n",
+      "True\n",
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Push the shared input through the paddle layer and the torch layer.\n",
+    "y = L(px)\n",
+    "ty = TL(tx)\n",
+    "print(y.numpy())\n",
+    "print(ty.data.numpy())\n",
+    "\n",
+    "# First the inputs, then the outputs, both with np.allclose's default tolerances.\n",
+    "# NOTE(review): the recorded run printed False for the outputs although the printed\n",
+    "# values agree to about seven digits - looks like float32 round-off; confirm with an\n",
+    "# explicit atol.\n",
+    "for ours, theirs in ((px, tx), (y, ty)):\n",
+    "    print(np.allclose(ours.numpy(), theirs.detach().numpy()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "incorrect-allah",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "prostate-cameroon",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "governmental-surge",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 0.07453135  0.0698561   0.6273111  ...  0.5845924  -0.65527105\n",
+      "   0.5881643 ]\n",
+      " [ 0.3902049  -0.17455879 -1.1802813  ... -0.36912322  0.55681896\n",
+      "  -0.11917676]]\n",
+      "[[ 0.07453132  0.06985616  0.62731117 ...  0.5845925  -0.65527105\n",
+      "   0.5881642 ]\n",
+      " [ 0.39020485 -0.17455864 -1.1802814  ... -0.3691232   0.556819\n",
+      "  -0.11917675]]\n",
+      "True\n",
+      "False\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Repeat the paddle-vs-torch comparison on a 2-D (2, 256) batch.\n",
+    "# x, px, tx, y and ty are rebound here and replace the values from the earlier cells.\n",
+    "x = np.random.randn(2, 256)\n",
+    "px, tx = paddle.to_tensor(x, dtype='float32'), torch.tensor(x, dtype=torch.float32)\n",
+    "y, ty = L(px), TL(tx)\n",
+    "print(y.numpy())\n",
+    "print(ty.data.numpy())\n",
+    "print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
+    "\n",
+    "# Outputs: first with the default tolerances, then with atol relaxed to 1e-5.\n",
+    "for tolerance in ({}, {'atol': 1e-5}):\n",
+    "    print(np.allclose(y.numpy(), ty.detach().numpy(), **tolerance))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "confidential-jacket",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "improved-civilization",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/.notebook/layer_norm_test.ipynb
+++ b/.notebook/layer_norm_test.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "academic-surname",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import paddle\n",
+    "from paddle import nn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "fundamental-treasure",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
+      "       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
+      "Parameter containing:\n",
+      "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
+      "       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Paddle LayerNorm with normalized shape 256 and epsilon 1e-12.\n",
+    "L = nn.LayerNorm(256, epsilon=1e-12)\n",
+    "\n",
+    "# Show its freshly initialised parameters.\n",
+    "for param in L.parameters():\n",
+    "    print(param)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "consolidated-elephant",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "moderate-noise",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Random (2, 51, 256) test batch; randn is the convenience wrapper around\n",
+    "# standard_normal, so this draws from the same global RNG stream.\n",
+    "x = np.random.standard_normal((2, 51, 256))\n",
+    "print(x.dtype)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "cooked-progressive",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the NumPy batch to a float32 paddle tensor, then normalise it.\n",
+    "px = paddle.to_tensor(x, dtype='float32')\n",
+    "y = L(px)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "optimum-milwaukee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "viral-indian",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1.], requires_grad=True)\n",
+      "Parameter containing:\n",
+      "tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
+      "       requires_grad=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Torch LayerNorm configured like the paddle one (256 features, eps 1e-12).\n",
+    "TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
+    "\n",
+    "# Show its freshly initialised parameters.\n",
+    "for param in TL.parameters():\n",
+    "    print(param)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "skilled-vietnamese",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the same NumPy batch to a float32 torch tensor, then normalise it.\n",
+    "tx = torch.tensor(x, dtype=torch.float32)\n",
+    "ty = TL(tx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "incorrect-allah",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Default-tolerance comparison of the paddle output (a) against the torch output (b).\n",
+    "# The argument order is kept because np.allclose scales rtol by |b|, so it is not symmetric.\n",
+    "np.allclose(a=y.numpy(), b=ty.detach().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "prostate-cameroon",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "governmental-surge",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Repeat the LayerNorm comparison on a 2-D (2, 256) batch; x, y and ty are rebound here.\n",
+    "x = np.random.randn(2, 256)\n",
+    "px = paddle.to_tensor(x, dtype='float32')\n",
+    "tx = torch.tensor(x, dtype=torch.float32)\n",
+    "y, ty = L(px), TL(tx)\n",
+    "np.allclose(a=y.numpy(), b=ty.detach().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "confidential-jacket",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/.notebook/position_embeding_check.ipynb
+++ b/.notebook/position_embeding_check.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "designing-borough",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00\n",
+      "   0.0000000e+00  0.0000000e+00]\n",
+      " [ 8.4147096e-01  8.0196178e-01  7.6172036e-01 ...  1.2409373e-04\n",
+      "   1.1547816e-04  1.0746076e-04]\n",
+      " [ 9.0929741e-01  9.5814437e-01  9.8704624e-01 ...  2.4818745e-04\n",
+      "   2.3095631e-04  2.1492151e-04]\n",
+      " ...\n",
+      " [ 3.7960774e-01  7.4510968e-01  7.3418564e-01 ...  1.2036801e-02\n",
+      "   1.1201146e-02  1.0423505e-02]\n",
+      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ...  1.2160885e-02\n",
+      "   1.1316618e-02  1.0530960e-02]\n",
+      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ...  1.2284970e-02\n",
+      "   1.1432089e-02  1.0638415e-02]]\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "import math\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "\n",
+    "# Table size: max_len positions, d_model channels.\n",
+    "max_len = 100\n",
+    "d_model = 256\n",
+    "\n",
+    "# Column vector of positions 0 .. max_len-1; kept under a second name because the\n",
+    "# paddle cell rebinds position.\n",
+    "position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)\n",
+    "toruch_position = position\n",
+    "\n",
+    "# One frequency per even channel index: exp(-i * ln(10000) / d_model).\n",
+    "even_channels = torch.arange(0, d_model, 2, dtype=torch.float32)\n",
+    "div_term = torch.exp(even_channels * -(math.log(10000.0) / d_model))\n",
+    "tourch_div_term = div_term.cpu().detach().numpy()\n",
+    "\n",
+    "# The position/frequency product is shared by the sin and cos halves, so compute it once.\n",
+    "angles = position * div_term\n",
+    "torhc_sin = torch.sin(angles)\n",
+    "torhc_cos = torch.cos(angles)\n",
+    "print(torhc_sin.cpu().detach().numpy())\n",
+    "\n",
+    "# NumPy reference computed from the very same angles.\n",
+    "angles_np = angles.cpu().detach().numpy()\n",
+    "np_sin = np.sin(angles_np)\n",
+    "np_cos = np.cos(angles_np)\n",
+    "for reference, torch_values in ((np_sin, torhc_sin), (np_cos, torhc_cos)):\n",
+    "    print(np.allclose(reference, torch_values.cpu().detach().numpy()))\n",
+    "\n",
+    "# Interleave: sin on even channels, cos on odd channels.\n",
+    "pe = torch.zeros(max_len, d_model)\n",
+    "pe[:, 0::2] = torhc_sin\n",
+    "pe[:, 1::2] = torhc_cos\n",
+    "tourch_pe = pe.cpu().detach().numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "swiss-referral",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "False\n",
+      "False\n",
+      "False\n",
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "import paddle\n",
+    "\n",
+    "# Rebuild the same table with paddle. position and div_term are rebound to paddle\n",
+    "# tensors here; the torch values live on in toruch_position / tourch_div_term.\n",
+    "ppe = paddle.zeros([max_len, d_model])\n",
+    "position = paddle.arange(0, max_len, dtype='float32').unsqueeze(1)\n",
+    "print(np.allclose(position.numpy(), toruch_position))\n",
+    "\n",
+    "even_channels = paddle.arange(0, d_model, 2, dtype='float32')\n",
+    "div_term = paddle.exp(even_channels * -(math.log(10000.0) / d_model))\n",
+    "print(np.allclose(div_term.numpy(), tourch_div_term))\n",
+    "\n",
+    "# Shared position/frequency product for both halves.\n",
+    "p_angles = position * div_term\n",
+    "p_sin = paddle.sin(p_angles)\n",
+    "p_cos = paddle.cos(p_angles)\n",
+    "\n",
+    "# Relative-only check (rtol=1e-6, atol=0) against the NumPy reference from the torch cell.\n",
+    "for reference, paddle_values in ((np_sin, p_sin), (np_cos, p_cos)):\n",
+    "    print(np.allclose(reference, paddle_values.numpy(), rtol=1.e-6, atol=0))\n",
+    "\n",
+    "# Interleave: sin on even channels, cos on odd channels.\n",
+    "ppe[:, 0::2] = p_sin\n",
+    "ppe[:, 1::2] = p_cos\n",
+    "\n",
+    "# Default-tolerance check against the torch sin / cos tensors.\n",
+    "for paddle_values, torch_values in ((p_sin, torhc_sin), (p_cos, torhc_cos)):\n",
+    "    print(np.allclose(paddle_values.numpy(), torch_values.cpu().detach().numpy()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "integrated-boards",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Compare the full paddle table (a) with the full torch table (b), default tolerances.\n",
+    "tables_match = np.allclose(a=ppe.numpy(), b=pe.numpy())\n",
+    "print(tables_match)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "flying-reserve",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "revised-divide",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/.notebook/u2_model.ipynb
+++ b/.notebook/u2_model.ipynb