diff --git a/.notebook/Linear_test.ipynb b/.notebook/Linear_test.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..a11da386fd0ceeb17ca5d12c50c9505c64a52dc3
--- /dev/null
+++ b/.notebook/Linear_test.ipynb
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "academic-surname",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  def convert_to_list(value, n, name, dtype=np.int):\n"
+     ]
+    }
+   ],
+   "source": [
+    "import paddle\n",
+    "from paddle import nn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fundamental-treasure",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      "  and should_run_async(code)\n"
+     ]
+    }
+   ],
+   "source": [
+    "L = nn.Linear(256, 2048)\n",
+    "L2 = nn.Linear(2048, 256)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "consolidated-elephant",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import torch\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "moderate-noise",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "float64\n",
+      "Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
+      "       [[[-0.03137276,  0.75036579, -0.62955737, ..., -0.39516482,  2.41965628,  0.19466873],\n",
+      "         [ 0.55916852,  1.13357353,  0.28754908, ...,  0.28860641,  0.48257691, -1.07664418],\n",
+      "         [-0.27433595, -0.05911482,  0.04942252, ...,  0.46596146,  1.24395037, -1.98374581],\n",
+      "         ...,\n",
+      "         [-0.45322138,  0.51459873,  0.28475651, ..., -0.90797561, -0.80436397, -2.30388594],\n",
+      "         [ 0.20310247,  1.90435207, -1.02483511, ..., -1.59850407, -0.30733466,  0.49769276],\n",
+      "         [-2.63085651, -0.52244109,  0.32019949, ...,  1.10662329, -0.55995786, -0.36770794]],\n",
+      "\n",
+      "        [[-1.78831303,  2.24759626,  0.41386250, ..., -0.30020580, -0.16084948,  0.93251175],\n",
+      "         [ 0.03264519, -0.92942363,  1.58523536, ...,  1.23681784, -0.94711000,  0.63553023],\n",
+      "         [-0.19725564, -2.38587499, -0.29334834, ...,  0.83498263, -0.58492625,  0.58732986],\n",
+      "         ...,\n",
+      "         [-0.61646742, -1.02978027,  0.45410269, ...,  0.87052751, -0.20801133,  2.17943859],\n",
+      "         [-0.67230755, -0.79410625, -0.13054833, ..., -1.18138039, -0.47578079, -0.22610545],\n",
+      "         [ 2.57333422,  0.63872230,  0.70852041, ..., -0.44040251, -0.33339104, -0.24722832]]])\n",
+      "tensor([[[-0.0314,  0.7504, -0.6296,  ..., -0.3952,  2.4197,  0.1947],\n",
+      "         [ 0.5592,  1.1336,  0.2875,  ...,  0.2886,  0.4826, -1.0766],\n",
+      "         [-0.2743, -0.0591,  0.0494,  ...,  0.4660,  1.2440, -1.9837],\n",
+      "         ...,\n",
+      "         [-0.4532,  0.5146,  0.2848,  ..., -0.9080, -0.8044, -2.3039],\n",
+      "         [ 0.2031,  1.9044, -1.0248,  ..., -1.5985, -0.3073,  0.4977],\n",
+      "         [-2.6309, -0.5224,  0.3202,  ...,  1.1066, -0.5600, -0.3677]],\n",
+      "\n",
+      "        [[-1.7883,  2.2476,  0.4139,  ..., -0.3002, -0.1608,  0.9325],\n",
+      "         [ 0.0326, -0.9294,  1.5852,  ...,  1.2368, -0.9471,  0.6355],\n",
+      "         [-0.1973, -2.3859, -0.2933,  ...,  0.8350, -0.5849,  0.5873],\n",
+      "         ...,\n",
+      "         [-0.6165, -1.0298,  0.4541,  ...,  0.8705, -0.2080,  2.1794],\n",
+      "         [-0.6723, -0.7941, -0.1305,  ..., -1.1814, -0.4758, -0.2261],\n",
+      "         [ 2.5733,  0.6387,  0.7085,  ..., -0.4404, -0.3334, -0.2472]]])\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      "  and should_run_async(code)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# NumPy samples are float64 (see the dtype print); both frameworks get float32 tensors.\n",
+    "x = np.random.randn(2, 51, 256)\n",
+    "print(x.dtype)\n",
+    "px, tx = paddle.to_tensor(x, dtype='float32'), torch.tensor(x, dtype=torch.float32)\n",
+    "print(px)\n",
+    "print(tx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cooked-progressive",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "mechanical-prisoner",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE(review): allow_pickle=True can run arbitrary code while unpickling,\n",
+    "# so only load archives from a trusted source.\n",
+    "data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
+    "t_norm_ff, t_ff_out = data['norm_ff'], data['ff_out']\n",
+    "t_ff_l_x, t_ff_l_a_x = data['ff_l_x'], data['ff_l_a_x']\n",
+    "t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
+    "t_ps = data['ps']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "indie-marriage",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "assured-zambia",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Paddle's Linear keeps `weight` as [in, out], so the saved weights are\n",
+    "# transposed before loading; the biases are passed through unchanged.\n",
+    "L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
+    "L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
+    "\n",
+    "# Gather both layers' tensors in the same order as `t_ps`.\n",
+    "ps = [param for _, param in L.named_parameters()]\n",
+    "ps.extend(L2.state_dict().values())\n",
+    "\n",
+    "# The biases are 1-D, so `.T` leaves them unchanged and one check covers every entry.\n",
+    "for p, tp in zip(ps, t_ps):\n",
+    "    print(np.allclose(p.numpy(), tp.T))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "committed-jacob",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "extreme-traffic",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "optimum-milwaukee",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "viral-indian",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Torch reference layers, loaded from the same saved parameters as the Paddle ones.\n",
+    "# `t_ps` is indexed as [weight, bias, weight, bias] across the two layers and must\n",
+    "# already be in scope from the cell that reads 'enc_0_ff_out.npz'.\n",
+    "TL = torch.nn.Linear(256, 2048)\n",
+    "TL2 = torch.nn.Linear(2048, 256)\n",
+    "\n",
+    "# torch.nn.Linear stores `weight` as [out, in], so the arrays are used untransposed.\n",
+    "TL.load_state_dict({\n",
+    "    'weight': torch.tensor(t_ps[0]),\n",
+    "    'bias': torch.tensor(t_ps[1]),\n",
+    "})\n",
+    "TL2.load_state_dict({\n",
+    "    'weight': torch.tensor(t_ps[2]),\n",
+    "    'bias': torch.tensor(t_ps[3]),\n",
+    "})\n",
+    "\n",
+    "# Collect the loaded tensors as numpy arrays, first layer first.\n",
+    "ps = [\n",
+    "    tensor.data.numpy()\n",
+    "    for layer in (TL, TL2)\n",
+    "    for tensor in layer.state_dict().values()\n",
+    "]\n",
+    "\n",
+    "# Each loaded tensor should match the saved array it was built from.\n",
+    "for p, tp in zip(ps, t_ps):\n",
+    "    print(np.allclose(p, tp))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "skilled-vietnamese",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[-0.25528666 -0.9090747   0.12996    ...  0.02552819  0.37376517\n",
+      "  -0.558986  ]\n",
+      " [-0.45657372  0.23811203  0.33472425 ...  1.0797666  -0.7263612\n",
+      "   0.31549692]]\n",
+      "[[-0.25528657 -0.9090746   0.12996009 ...  0.02552832  0.37376505\n",
+      "  -0.5589858 ]\n",
+      " [-0.45657367  0.23811209  0.33472428 ...  1.0797666  -0.7263612\n",
+      "   0.31549698]]\n",
+      "True\n",
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Push the same input through both layers; the checks use allclose's default tolerances.\n",
+    "y = L(px)\n",
+    "ty = TL(tx)\n",
+    "print(y.numpy())\n",
+    "print(ty.data.numpy())\n",
+    "print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
+    "print(np.allclose(y.numpy(), ty.detach().numpy()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "incorrect-allah",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "prostate-cameroon",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "governmental-surge",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 0.07453135  0.0698561   0.6273111  ...  0.5845924  -0.65527105\n",
+      "   0.5881643 ]\n",
+      " [ 0.3902049  -0.17455879 -1.1802813  ... -0.36912322  0.55681896\n",
+      "  -0.11917676]]\n",
+      "[[ 0.07453132  0.06985616  0.62731117 ...  0.5845925  -0.65527105\n",
+      "   0.5881642 ]\n",
+      " [ 0.39020485 -0.17455864 -1.1802814  ... -0.3691232   0.556819\n",
+      "  -0.11917675]]\n",
+      "True\n",
+      "False\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Repeat the comparison on a fresh 2-D batch.\n",
+    "x = np.random.randn(2, 256)\n",
+    "px, tx = paddle.to_tensor(x, dtype='float32'), torch.tensor(x, dtype=torch.float32)\n",
+    "y, ty = L(px), TL(tx)\n",
+    "print(y.numpy())\n",
+    "print(ty.data.numpy())\n",
+    "print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
+    "# Default tolerances first, then a looser absolute tolerance of 1e-5.\n",
+    "print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
+    "print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "confidential-jacket",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "improved-civilization",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/.notebook/layer_norm_test.ipynb b/.notebook/layer_norm_test.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..eac3566ff0590295a1f3b742cd8d038f420500ce
--- /dev/null
+++ b/.notebook/layer_norm_test.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "academic-surname",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import paddle\n",
+    "from paddle import nn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "fundamental-treasure",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
+      "       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
+      "Parameter containing:\n",
+      "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
+      "       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
+     ]
+    }
+   ],
+   "source": [
+    "L = nn.LayerNorm(256, epsilon=1e-12)\n",
+    "for p in L.parameters():\n",
+    "    print(p)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "consolidated-elephant",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "moderate-noise",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "x = np.random.randn(2, 51, 256)\n",
+    "print(x.dtype)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "cooked-progressive",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y = L(paddle.to_tensor(x, dtype='float32'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "optimum-milwaukee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "viral-indian",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
+      "        1., 1., 1., 1.], requires_grad=True)\n",
+      "Parameter containing:\n",
+      "tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
+      "       requires_grad=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
+    "for p in TL.parameters():\n",
+    "    print(p)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "skilled-vietnamese",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ty = TL(torch.tensor(x, dtype=torch.float32))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "incorrect-allah",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.allclose(y.numpy(), ty.detach().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "prostate-cameroon",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "governmental-surge",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x = np.random.randn(2, 256)\n",
+    "y = L(paddle.to_tensor(x, dtype='float32'))\n",
+    "ty = TL(torch.tensor(x, dtype=torch.float32))\n",
+    "np.allclose(y.numpy(), ty.detach().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "confidential-jacket",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/.notebook/position_embeding_check.ipynb b/.notebook/position_embeding_check.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f7410c256b69dabfdaaebb2ff6fa8e45aa83ec31
--- /dev/null
+++ b/.notebook/position_embeding_check.ipynb
@@ -0,0 +1,160 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "designing-borough",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00\n",
+      "   0.0000000e+00  0.0000000e+00]\n",
+      " [ 8.4147096e-01  8.0196178e-01  7.6172036e-01 ...  1.2409373e-04\n",
+      "   1.1547816e-04  1.0746076e-04]\n",
+      " [ 9.0929741e-01  9.5814437e-01  9.8704624e-01 ...  2.4818745e-04\n",
+      "   2.3095631e-04  2.1492151e-04]\n",
+      " ...\n",
+      " [ 3.7960774e-01  7.4510968e-01  7.3418564e-01 ...  1.2036801e-02\n",
+      "   1.1201146e-02  1.0423505e-02]\n",
+      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ...  1.2160885e-02\n",
+      "   1.1316618e-02  1.0530960e-02]\n",
+      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ...  1.2284970e-02\n",
+      "   1.1432089e-02  1.0638415e-02]]\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import math\n",
+    "import numpy as np\n",
+    "\n",
+    "max_len=100\n",
+    "d_model=256\n",
+    "\n",
+    "pe = torch.zeros(max_len, d_model)\n",
+    "position = torch.arange(0, max_len,\n",
+    "                        dtype=torch.float32).unsqueeze(1)\n",
+    "toruch_position = position\n",
+    "div_term = torch.exp(\n",
+    "    torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
+    "    -(math.log(10000.0) / d_model))\n",
+    "tourch_div_term = div_term.cpu().detach().numpy()\n",
+    "\n",
+    "\n",
+    "\n",
+    "torhc_sin = torch.sin(position * div_term)\n",
+    "torhc_cos = torch.cos(position * div_term)\n",
+    "print(torhc_sin.cpu().detach().numpy())\n",
+    "np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
+    "np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
+    "print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
+    "print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
+    "pe[:, 0::2] = torhc_sin\n",
+    "pe[:, 1::2] = torhc_cos\n",
+    "tourch_pe = pe.cpu().detach().numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "swiss-referral",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "False\n",
+      "False\n",
+      "False\n",
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "import paddle\n",
+    "ppe = paddle.zeros([max_len, d_model])\n",
+    "position = paddle.arange(0, max_len,\n",
+    "                        dtype='float32').unsqueeze(1)\n",
+    "print(np.allclose(position.numpy(), toruch_position))\n",
+    "div_term = paddle.exp(\n",
+    "    paddle.arange(0, d_model, 2, dtype='float32') *\n",
+    "    -(math.log(10000.0) / d_model))\n",
+    "print(np.allclose(div_term.numpy(), tourch_div_term))\n",
+    "\n",
+    "\n",
+    "\n",
+    "p_sin = paddle.sin(position * div_term)\n",
+    "p_cos = paddle.cos(position * div_term)\n",
+    "print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
+    "print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
+    "ppe[:, 0::2] = p_sin\n",
+    "ppe[:, 1::2] = p_cos\n",
+    "print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
+    "print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "integrated-boards",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "False\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(np.allclose(ppe.numpy(), pe.numpy()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "flying-reserve",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "revised-divide",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/.notebook/u2_model.ipynb b/.notebook/u2_model.ipynb
index f9e7c1eecd879b75f231dba3fed860879361c8ed..4f2c9632f4634e83924515925aafd2c52c20a270 100644
--- a/.notebook/u2_model.ipynb
+++ b/.notebook/u2_model.ipynb
@@ -100,7 +100,7 @@
      "text": [
       "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
       "  and should_run_async(code)\n",
-      "[INFO 2021/04/19 06:57:01 u2.py:834] U2 Encoder type: conformer\n"
+      "[INFO 2021/04/20 03:32:21 u2.py:834] U2 Encoder type: conformer\n"
      ]
     },
     {
@@ -1439,13 +1439,7 @@
       "decoder.decoders.3.feed_forward.w_2.weight | [2048, 256] | 524288\n",
       "decoder.decoders.3.feed_forward.w_2.bias | [256] | 256\n",
       "decoder.decoders.3.norm1.weight | [256] | 256\n",
-      "decoder.decoders.3.norm1.bias | [256] | 256\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "decoder.decoders.3.norm1.bias | [256] | 256\n",
       "decoder.decoders.3.norm2.weight | [256] | 256\n",
       "decoder.decoders.3.norm2.bias | [256] | 256\n",
       "decoder.decoders.3.norm3.weight | [256] | 256\n",
@@ -1526,7 +1520,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "id": "ruled-invitation",
    "metadata": {},
    "outputs": [
@@ -2184,6 +2178,16 @@
     "print(model)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "fossil-means",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load feat"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,
@@ -2194,13 +2198,18 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "compute_cmvn_loader_test.ipynb         jit_infer.ipynb\r\n",
-      "dataloader.ipynb                       mask_and_masked_fill_test.ipynb\r\n",
-      "dataloader_with_tokens_tokenids.ipynb  model.npz\r\n",
-      "data.npz                               python_test.ipynb\r\n",
-      "decoder.npz                            train_test.ipynb\r\n",
-      "encoder.npz                            u2_model.ipynb\r\n",
-      "hack_api_test.ipynb\r\n"
+      "compute_cmvn_loader_test.ipynb         encoder.npz\r\n",
+      "dataloader.ipynb                       hack_api_test.ipynb\r\n",
+      "dataloader_with_tokens_tokenids.ipynb  jit_infer.ipynb\r\n",
+      "data.npz                               layer_norm_test.ipynb\r\n",
+      "decoder.npz                            Linear_test.ipynb\r\n",
+      "enc_0_ff_out.npz                       mask_and_masked_fill_test.ipynb\r\n",
+      "enc_0_norm_ff.npz                      model.npz\r\n",
+      "enc_0.npz                              position_embeding_check.ipynb\r\n",
+      "enc_0_selattn_out.npz                  python_test.ipynb\r\n",
+      "enc_2.npz                              train_test.ipynb\r\n",
+      "enc_all.npz                            u2_model.ipynb\r\n",
+      "enc_embed.npz\r\n"
      ]
     }
    ],
@@ -2213,21 +2222,6 @@
    "execution_count": 8,
    "id": "abroad-oracle",
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = np.load('.notebook/data.npz', allow_pickle=True)\n",
-    "keys=data['keys']\n",
-    "feat=data['feat']\n",
-    "feat_len=data['feat_len']\n",
-    "text=data['text']\n",
-    "text_len=data['text_len']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "false-instrument",
-   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -2311,6 +2305,12 @@
     }
    ],
    "source": [
+    "data = np.load('.notebook/data.npz', allow_pickle=True)\n",
+    "keys=data['keys']\n",
+    "feat=data['feat']\n",
+    "feat_len=data['feat_len']\n",
+    "text=data['text']\n",
+    "text_len=data['text_len']\n",
     "print(keys)\n",
     "print(feat.shape)\n",
     "print(feat)\n",
@@ -2321,7 +2321,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
+   "id": "false-instrument",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
    "id": "arctic-proxy",
    "metadata": {},
    "outputs": [],
@@ -2400,7 +2408,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
+   "id": "seasonal-switch",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
    "id": "defined-brooks",
    "metadata": {},
    "outputs": [
@@ -2408,17 +2424,23 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "compute_cmvn_loader_test.ipynb\t       jit_infer.ipynb\r\n",
-      "dataloader.ipynb\t\t       mask_and_masked_fill_test.ipynb\r\n",
-      "dataloader_with_tokens_tokenids.ipynb  model.npz\r\n",
-      "data.npz\t\t\t       python_test.ipynb\r\n",
-      "decoder.npz\t\t\t       train_test.ipynb\r\n",
-      "encoder.npz\t\t\t       u2_model.ipynb\r\n",
-      "hack_api_test.ipynb\r\n"
+      "compute_cmvn_loader_test.ipynb\t       encoder.npz\r\n",
+      "dataloader.ipynb\t\t       hack_api_test.ipynb\r\n",
+      "dataloader_with_tokens_tokenids.ipynb  jit_infer.ipynb\r\n",
+      "data.npz\t\t\t       layer_norm_test.ipynb\r\n",
+      "decoder.npz\t\t\t       Linear_test.ipynb\r\n",
+      "enc_0_ff_out.npz\t\t       mask_and_masked_fill_test.ipynb\r\n",
+      "enc_0_norm_ff.npz\t\t       model.npz\r\n",
+      "enc_0.npz\t\t\t       position_embeding_check.ipynb\r\n",
+      "enc_0_selattn_out.npz\t\t       python_test.ipynb\r\n",
+      "enc_2.npz\t\t\t       train_test.ipynb\r\n",
+      "enc_all.npz\t\t\t       u2_model.ipynb\r\n",
+      "enc_embed.npz\r\n"
      ]
     }
    ],
    "source": [
+    "# load model param\n",
     "!ls .notebook\n",
     "data = np.load('.notebook/model.npz', allow_pickle=True)\n",
     "state_dict = data['state'].item()\n",
@@ -2445,7 +2467,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "id": "confident-piano",
    "metadata": {},
    "outputs": [
@@ -2478,6 +2500,7 @@
     }
    ],
    "source": [
+    "# compute loss\n",
     "import paddle\n",
     "feat=paddle.to_tensor(feat)\n",
     "feat_len=paddle.to_tensor(feat_len, dtype='int64')\n",
@@ -2492,12 +2515,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "better-senator",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# tensor(142.4858, device='cuda:0', grad_fn=<AddBackward0>) tensor(41.8416, device='cuda:0', grad_fn=<DivBackward0>) tensor(377.3222, device='cuda:0', grad_fn=<DivBackward0>)"
+    "# tensor(142.4888, device='cuda:0', grad_fn=<AddBackward0>) \n",
+    "# tensor(41.8415, device='cuda:0', grad_fn=<DivBackward0>) \n",
+    "# tensor(377.3326, device='cuda:0', grad_fn=<DivBackward0>)\n",
+    "# 142.4888 41.84146 377.33258"
    ]
   },
   {
@@ -2510,7 +2536,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "id": "olympic-problem",
    "metadata": {},
    "outputs": [
@@ -2532,23 +2558,16 @@
     }
    ],
    "source": [
+    "# encoder\n",
     "encoder_out, encoder_mask = model.encoder(feat, feat_len)\n",
     "print(encoder_out.shape)\n",
     "print(encoder_mask.shape)\n",
-    "print(encoder_out[0])\n"
+    "print(encoder_out[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "cubic-values",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "id": "shaped-alaska",
    "metadata": {},
    "outputs": [
@@ -2571,7 +2590,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "id": "federal-rover",
    "metadata": {},
    "outputs": [
@@ -2589,7 +2608,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "id": "regulated-interstate",
    "metadata": {},
    "outputs": [
@@ -2610,18 +2629,38 @@
       " [-1.165412    0.6819976   0.69394535 ...  1.2238353   0.80282927\n",
       "   1.4506509 ]\n",
       " [-1.2732087   0.71458083  0.7581961  ...  0.9415482   0.877484\n",
-      "   1.2623053 ]]\n"
+      "   1.2623053 ]]\n",
+      "----\n",
+      "[[-0.7019418   0.56254166  0.6880346  ...  1.1237322   0.78039235\n",
+      "   1.1369387 ]\n",
+      " [-0.7787781   0.39126658  0.71887815 ...  1.2518822   0.8861679\n",
+      "   1.3173453 ]\n",
+      " [-0.95908946  0.6346025   0.87671334 ...  0.9818373   0.7440108\n",
+      "   1.2903266 ]\n",
+      " ...\n",
+      " [-1.073225    0.67236906  0.9230311  ...  0.9075456   0.81767166\n",
+      "   1.3239657 ]\n",
+      " [-1.1654116   0.68199694  0.69394493 ...  1.2238349   0.8028289\n",
+      "   1.4506508 ]\n",
+      " [-1.2732095   0.7145803   0.7581956  ...  0.9415491   0.87748396\n",
+      "   1.2623051 ]]\n",
+      "True\n",
+      "False\n"
      ]
     }
    ],
    "source": [
     "print(np.allclose(torch_encoder_out, encoder_out.numpy()))\n",
-    "print(torch_encoder_out[0])"
+    "print(torch_encoder_out[0])\n",
+    "print(\"----\")\n",
+    "print(encoder_out.numpy()[0])\n",
+    "print(np.allclose(torch_encoder_out, encoder_out.numpy(), atol=1e-5, rtol=1e-6))\n",
+    "print(np.allclose(torch_encoder_out, encoder_out.numpy(), atol=1e-6, rtol=1e-6))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 17,
    "id": "proof-scheduling",
    "metadata": {},
    "outputs": [
@@ -2630,23 +2669,23 @@
      "output_type": "stream",
      "text": [
       "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
-      "       [377.32220459])\n",
+      "       [377.33258057])\n",
       "[1.]\n",
-      "[[ 3.1708076e+00 -1.5184805e-02  4.9524564e-02 ... -2.4678309e-03\n",
-      "  -5.9236852e-03 -7.2192554e-03]\n",
-      " [-1.7474542e+00  7.7654729e-03 -4.5106117e-02 ...  9.8463835e-04\n",
-      "   2.4569160e-03  2.2863639e-03]\n",
-      " [-2.3707268e+00  1.3136451e-02 -2.6281785e-02 ...  2.2738585e-03\n",
-      "   5.7726162e-03  7.4628354e-03]\n",
+      "[[ 3.16902876e+00 -1.51763987e-02  4.91095744e-02 ... -2.47971853e-03\n",
+      "  -5.93360700e-03 -7.26609165e-03]\n",
+      " [-1.74184477e+00  7.75874173e-03 -4.49434854e-02 ...  9.92412097e-04\n",
+      "   2.46337592e-03  2.31892057e-03]\n",
+      " [-2.33343339e+00  1.30475955e-02 -2.66557075e-02 ...  2.27532350e-03\n",
+      "   5.76924905e-03  7.48788286e-03]\n",
       " ...\n",
-      " [-4.4350743e+00  2.4916438e-02 -9.0385124e-02 ...  4.4534383e-03\n",
-      "   1.1696636e-02  1.4515720e-02]\n",
-      " [-3.3899918e+00  1.7287316e-02 -6.3514955e-02 ...  3.2612216e-03\n",
-      "   8.5411733e-03  1.0692922e-02]\n",
-      " [-6.6964636e+00  3.5097409e-02 -1.2437013e-01 ...  6.3515711e-03\n",
-      "   1.6078018e-02  2.0318989e-02]]\n",
-      "[-4.4341431e+00  2.3347888e-02 -9.3501516e-02 ...  4.2512305e-03\n",
-      "  1.0928102e-02  1.3750527e-02]\n"
+      " [-4.30358458e+00  2.46054661e-02 -9.00950655e-02 ...  4.43156436e-03\n",
+      "   1.16122244e-02  1.44715561e-02]\n",
+      " [-3.36921120e+00  1.73153952e-02 -6.36872873e-02 ...  3.28363618e-03\n",
+      "   8.58010259e-03  1.07794888e-02]\n",
+      " [-6.62045336e+00  3.49955931e-02 -1.23962618e-01 ...  6.36671018e-03\n",
+      "   1.60814095e-02  2.03891303e-02]]\n",
+      "[-4.3777819e+00  2.3245810e-02 -9.3339294e-02 ...  4.2569344e-03\n",
+      "  1.0919910e-02  1.3787797e-02]\n"
      ]
     }
    ],
@@ -2679,23 +2718,25 @@
     "print(loss_ctc.grad)\n",
     "print(model.ctc.ctc_lo.weight.grad)\n",
     "print(model.ctc.ctc_lo.bias.grad)\n",
-    "# tensor(377.3222, device='cuda:0', grad_fn=<DivBackward0>)\n",
+    "\n",
+    "\n",
+    "# tensor(377.3326, device='cuda:0', grad_fn=<DivBackward0>)\n",
     "# None\n",
-    "# tensor([[ 3.1708e+00, -1.7475e+00, -2.3708e+00,  ..., -4.4351e+00,\n",
-    "#          -3.3900e+00, -6.6965e+00],\n",
-    "#         [-1.5185e-02,  7.7655e-03,  1.3137e-02,  ...,  2.4917e-02,\n",
-    "#           1.7287e-02,  3.5098e-02],\n",
-    "#         [ 4.9522e-02, -4.5104e-02, -2.6280e-02,  ..., -9.0381e-02,\n",
-    "#          -6.3512e-02, -1.2436e-01],\n",
-    "#         ...,\n",
-    "#         [-2.4678e-03,  9.8464e-04,  2.2739e-03,  ...,  4.4535e-03,\n",
-    "#           3.2612e-03,  6.3516e-03],\n",
-    "#         [-5.9237e-03,  2.4569e-03,  5.7726e-03,  ...,  1.1697e-02,\n",
-    "#           8.5412e-03,  1.6078e-02],\n",
-    "#         [-7.2193e-03,  2.2864e-03,  7.4629e-03,  ...,  1.4516e-02,\n",
-    "#           1.0693e-02,  2.0319e-02]], device='cuda:0')\n",
-    "# tensor([-4.4342e+00,  2.3348e-02, -9.3497e-02,  ...,  4.2513e-03,\n",
-    "#          1.0928e-02,  1.3751e-02], device='cuda:0')"
+    "# [[ 3.16902351e+00 -1.51765049e-02  4.91097234e-02 ... -2.47973716e-03\n",
+    "#   -5.93366381e-03 -7.26613170e-03]\n",
+    "#  [-1.74185038e+00  7.75875803e-03 -4.49435972e-02 ...  9.92415240e-04\n",
+    "#    2.46338220e-03  2.31891591e-03]\n",
+    "#  [-2.33343077e+00  1.30476682e-02 -2.66557615e-02 ...  2.27533933e-03\n",
+    "#    5.76929189e-03  7.48792710e-03]\n",
+    "#  ...\n",
+    "#  [-4.30356789e+00  2.46056803e-02 -9.00955945e-02 ...  4.43160534e-03\n",
+    "#    1.16123557e-02  1.44716976e-02]\n",
+    "#  [-3.36919212e+00  1.73155665e-02 -6.36875406e-02 ...  3.28367390e-03\n",
+    "#    8.58021621e-03  1.07796099e-02]\n",
+    "#  [-6.62039661e+00  3.49958315e-02 -1.23963736e-01 ...  6.36674836e-03\n",
+    "#    1.60815325e-02  2.03892551e-02]]\n",
+    "# [-4.3777566e+00  2.3245990e-02 -9.3339972e-02 ...  4.2569702e-03\n",
+    "#   1.0920014e-02  1.3787906e-02]"
    ]
   },
   {
@@ -2708,7 +2749,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "id": "synthetic-hungarian",
    "metadata": {},
    "outputs": [
@@ -2717,7 +2758,7 @@
      "output_type": "stream",
      "text": [
       "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
-      "       [41.84160995]) 0.0\n"
+      "       [41.84146118]) 0.0\n"
      ]
     }
    ],
@@ -2730,17 +2771,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 19,
    "id": "indian-sweden",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# encoder, decoder不对齐"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 202,
    "id": "marine-cuisine",
    "metadata": {},
    "outputs": [
@@ -2772,7 +2811,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 180,
    "id": "several-result",
    "metadata": {},
    "outputs": [],
@@ -2833,7 +2872,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 181,
    "id": "possible-bulgaria",
    "metadata": {},
    "outputs": [
@@ -2890,7 +2929,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 285,
    "id": "north-walter",
    "metadata": {},
    "outputs": [
@@ -2898,25 +2937,49 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[16, 7, 4233]\n",
-      "Tensor(shape=[7, 4233], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
-      "       [[-0.37638962, -0.82272029,  0.74276292, ...,  0.34200522,  0.01503509,  0.40337229],\n",
-      "        [-0.87386417, -0.31389427,  0.41987872, ...,  0.37723723, -0.14352795, -1.00236630],\n",
-      "        [-0.43505096,  0.03450463, -0.28710306, ...,  0.07727426, -1.16722453, -0.26848495],\n",
-      "        ...,\n",
-      "        [ 0.42471474,  0.58885634,  0.02020410, ...,  0.37405482,  0.04546990, -0.37139422],\n",
-      "        [-0.37978464, -0.81084198,  0.75725073, ...,  0.26038912, -0.00079346,  0.42537683],\n",
-      "        [-0.38279879, -0.81206709,  0.74943423, ...,  0.26172996, -0.00104988,  0.42678767]])\n",
-      "False\n"
+      "False\n",
+      "True\n",
+      "False\n",
+      "[[-3.76389682e-01 -8.22720408e-01  7.42762923e-01 ...  3.42005253e-01\n",
+      "   1.50350705e-02  4.03372347e-01]\n",
+      " [-8.73864174e-01 -3.13894272e-01  4.19878662e-01 ...  3.77237231e-01\n",
+      "  -1.43528014e-01 -1.00236630e+00]\n",
+      " [-4.35050905e-01  3.45046446e-02 -2.87102997e-01 ...  7.72742853e-02\n",
+      "  -1.16722476e+00 -2.68485069e-01]\n",
+      " ...\n",
+      " [ 4.24714804e-01  5.88856399e-01  2.02039629e-02 ...  3.74054879e-01\n",
+      "   4.54700664e-02 -3.71394157e-01]\n",
+      " [-3.79784584e-01 -8.10841978e-01  7.57250786e-01 ...  2.60389000e-01\n",
+      "  -7.93404877e-04  4.25376773e-01]\n",
+      " [-3.82798851e-01 -8.12067091e-01  7.49434292e-01 ...  2.61730075e-01\n",
+      "  -1.04988366e-03  4.26787734e-01]]\n",
+      "---\n",
+      "[[-3.7638968e-01 -8.2272053e-01  7.4276292e-01 ...  3.4200522e-01\n",
+      "   1.5034772e-02  4.0337229e-01]\n",
+      " [-8.7386459e-01 -3.1389427e-01  4.1987866e-01 ...  3.7723729e-01\n",
+      "  -1.4352810e-01 -1.0023664e+00]\n",
+      " [-4.3505096e-01  3.4504786e-02 -2.8710306e-01 ...  7.7274129e-02\n",
+      "  -1.1672243e+00 -2.6848501e-01]\n",
+      " ...\n",
+      " [ 4.2471480e-01  5.8885634e-01  2.0203922e-02 ...  3.7405500e-01\n",
+      "   4.5470044e-02 -3.7139410e-01]\n",
+      " [-3.7978446e-01 -8.1084180e-01  7.5725085e-01 ...  2.6038891e-01\n",
+      "  -7.9347193e-04  4.2537671e-01]\n",
+      " [-3.8279903e-01 -8.1206715e-01  7.4943429e-01 ...  2.6173013e-01\n",
+      "  -1.0499060e-03  4.2678756e-01]]\n"
      ]
     }
    ],
    "source": [
     "decoder_out, _ = model.decoder(encoder_out, encoder_mask, ys_in_pad,\n",
     "                                      ys_in_lens)\n",
-    "print(decoder_out.shape)\n",
-    "print(decoder_out[0])\n",
-    "print(np.allclose(decoder_out.numpy(), torch_decoder_out))"
+    "\n",
+    "print(np.allclose(decoder_out.numpy(), torch_decoder_out))\n",
+    "print(np.allclose(decoder_out.numpy(), torch_decoder_out, atol=1e-6))\n",
+    "print(np.allclose(decoder_out.numpy(), torch_decoder_out, atol=1e-7))\n",
+    "print(decoder_out.numpy()[0])\n",
+    "print('---')\n",
+    "print(torch_decoder_out[0])"
    ]
   },
   {
@@ -2945,13 +3008,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 183,
    "id": "assisted-fortune",
    "metadata": {},
    "outputs": [],
    "source": [
     "from paddle import nn\n",
     "import paddle\n",
+    "from paddle.nn import functional as F\n",
+    "\n",
     "class LabelSmoothingLoss(nn.Layer):\n",
     "\n",
     "    def __init__(self,\n",
@@ -3016,7 +3081,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 184,
    "id": "weighted-delight",
    "metadata": {},
    "outputs": [
@@ -3034,7 +3099,7 @@
       "        [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n",
       "        [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363]])\n",
       "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [41.84160995])\n",
+      "       [41.84146118])\n",
       "VarType.INT64\n"
      ]
     }
@@ -3049,7 +3114,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 286,
    "id": "dress-shelter",
    "metadata": {},
    "outputs": [
@@ -3058,7 +3123,7 @@
      "output_type": "stream",
      "text": [
       "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [41.84160995])\n",
+      "       [41.84146118])\n",
       "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
       "       [41.84146118])\n",
       "4233\n",
@@ -3094,7 +3159,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": null,
+   "id": "going-hungary",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "naughty-citizenship",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "experimental-emerald",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "adverse-saskatchewan",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
    "id": "speaking-shelf",
    "metadata": {},
    "outputs": [],
@@ -3261,7 +3358,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 28,
    "id": "sharp-municipality",
    "metadata": {},
    "outputs": [],
@@ -3351,7 +3448,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 29,
    "id": "tutorial-syndication",
    "metadata": {},
    "outputs": [],
@@ -3377,7 +3474,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 30,
    "id": "fuzzy-register",
    "metadata": {},
    "outputs": [
@@ -3397,7 +3494,55 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
+   "id": "explicit-triumph",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "humanitarian-belgium",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dying-proposal",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "honest-quick",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bound-cholesterol",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "viral-packaging",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 203,
    "id": "balanced-locator",
    "metadata": {},
    "outputs": [
@@ -3431,7 +3576,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 204,
    "id": "induced-proposition",
    "metadata": {},
    "outputs": [
@@ -3499,7 +3644,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 205,
    "id": "cutting-julian",
    "metadata": {},
    "outputs": [
@@ -3833,7 +3978,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 206,
    "id": "friendly-nightlife",
    "metadata": {},
    "outputs": [
@@ -3940,7 +4085,15 @@
       "         ...,\n",
       "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
       "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170]]])\n"
+      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170]]])\n",
+      "Tensor(shape=[1, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
+      "       [[[ 0.        ,  1.        ,  0.        , ...,  1.        ,  0.        ,  1.        ],\n",
+      "         [ 0.84147102,  0.54030228,  0.80196184, ...,  1.        ,  0.00010746,  1.        ],\n",
+      "         [ 0.90929747, -0.41614681,  0.95814437, ...,  1.        ,  0.00021492,  1.        ],\n",
+      "         ...,\n",
+      "         [-0.76825470, -0.64014435,  0.63279730, ...,  0.99998462,  0.00515809,  0.99998671],\n",
+      "         [-0.95375264,  0.30059254,  0.99899054, ...,  0.99998397,  0.00526555,  0.99998611],\n",
+      "         [-0.26237485,  0.96496606,  0.56074661, ...,  0.99998331,  0.00537301,  0.99998558]]])\n"
      ]
     }
    ],
@@ -3949,69 +4102,106 @@
     "x = model.encoder.embed.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))\n",
     "print(x)\n",
     "x, pos_emb = model.encoder.embed.pos_enc(x, 0)\n",
-    "print(x)"
+    "print(x)\n",
+    "print(pos_emb)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "id": "exempt-cloud",
+   "execution_count": 207,
+   "id": "guilty-cache",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
-      "       [[[-0.54821998,  2.28660274, -1.07501972, ...,  1.45036042,  0.28950194, -0.69454080],\n",
-      "         [-0.80125421,  1.76875579, -1.66388774, ...,  1.83315802,  0.67914939, -0.19995420],\n",
-      "         [-1.71124649,  2.70574546, -1.33634126, ...,  1.23364413,  0.18697014, -0.57351983],\n",
-      "         ...,\n",
-      "         [-0.96968573,  2.31294894, -0.87524825, ...,  0.85838526,  0.48533469, -0.41773027],\n",
-      "         [-1.36094308,  2.17788029, -1.78127730, ...,  2.09278774,  0.25282228, -0.36496443],\n",
-      "         [-1.69674826,  2.35438418, -1.74168527, ...,  1.36695099,  0.59511113, -0.74147725]],\n",
-      "\n",
-      "        [[-1.98284078,  2.31777000, -0.90785271, ...,  0.41170627,  0.50061619,  0.08721463],\n",
-      "         [-0.76404583,  1.35577726, -1.36125672, ...,  0.73170459,  0.67842603,  0.16851945],\n",
-      "         [-0.95044655,  1.60376561, -1.30299675, ...,  0.57544005,  0.26769355,  0.33433008],\n",
-      "         ...,\n",
-      "         [-1.47567701,  2.53171301, -1.23207152, ...,  1.29967308,  0.50191855, -0.10343577],\n",
-      "         [-1.17308092,  2.31722355, -1.25421047, ...,  1.73911047,  0.21709818, -0.44447583],\n",
-      "         [-1.26996231,  3.22289634, -0.88719147, ...,  1.64605021,  0.09731755, -0.76786882]],\n",
-      "\n",
-      "        [[-0.58725590,  1.42905438, -1.39500988, ...,  0.21024795,  0.10272825,  0.09179455],\n",
-      "         [ 0.17428070,  1.78342295, -1.64217877, ...,  0.81127012,  0.31371105,  0.56344515],\n",
-      "         [-0.34916472,  1.83103430, -1.06851172, ...,  0.69243336,  0.13782299,  0.45937473],\n",
-      "         ...,\n",
-      "         [-1.08686376,  2.30020404, -1.26384079, ...,  1.79982817,  0.51338923, -0.52227837],\n",
-      "         [-1.26144814,  2.72396612, -1.37337780, ...,  1.44453299,  0.57420933, -0.33201432],\n",
-      "         [-2.20676827,  4.34621811, -3.82886696, ...,  2.14260173,  1.20336640, -1.37951219]],\n",
-      "\n",
-      "        ...,\n",
-      "\n",
-      "        [[-0.39141566,  1.85533464, -0.57471782, ...,  1.00623512,  0.46320182, -1.04523599],\n",
-      "         [-0.86054784,  2.01717925, -1.44368529, ...,  1.45262301,  0.16571884,  0.59231722],\n",
-      "         [-0.73066384,  2.28405023, -1.06989920, ...,  1.58249414, -0.09795550,  0.55030036],\n",
+      "Tensor(shape=[1, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
+      "       [[[ 0.        ,  1.        ,  0.        , ...,  1.        ,  0.        ,  1.        ],\n",
+      "         [ 0.84147102,  0.54030228,  0.80196184, ...,  1.        ,  0.00010746,  1.        ],\n",
+      "         [ 0.90929747, -0.41614681,  0.95814437, ...,  1.        ,  0.00021492,  1.        ],\n",
       "         ...,\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170]],\n",
-      "\n",
-      "        [[-0.16194311,  0.62550521, -1.13234293, ...,  0.07242929, -0.22042468,  0.46362036],\n",
-      "         [-0.08306468,  0.57504302, -1.09298003, ...,  0.91096652, -0.06501988,  0.72986233],\n",
-      "         [-0.28202093,  0.08014385, -0.94177192, ...,  0.33794850, -0.11664233,  0.44514441],\n",
-      "         ...,\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170]],\n",
-      "\n",
-      "        [[-0.54584920, -0.69092435, -1.35965478, ..., -0.78182435,  0.68747747,  0.98427159],\n",
-      "         [ 0.04212743, -1.10618520, -1.43891501, ..., -0.02385022,  0.91146135,  0.52870303],\n",
-      "         [-0.29093450, -0.18858244, -1.54873240, ..., -0.13923697,  0.05795169,  0.30663735],\n",
-      "         ...,\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170],\n",
-      "         [-5.08208990,  8.59203339, -4.21366739, ...,  6.26925707,  0.05394945, -2.92699170]]])\n"
+      "         [-0.76825470, -0.64014435,  0.63279730, ...,  0.99998462,  0.00515809,  0.99998671],\n",
+      "         [-0.95375264,  0.30059254,  0.99899054, ...,  0.99998397,  0.00526555,  0.99998611],\n",
+      "         [-0.26237485,  0.96496606,  0.56074661, ...,  0.99998331,  0.00537301,  0.99998558]]])\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(pos_emb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 208,
+   "id": "iraqi-payday",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[[ 0.0000000e+00  1.0000000e+00  0.0000000e+00 ...  1.0000000e+00\n",
+      "    0.0000000e+00  1.0000000e+00]\n",
+      "  [ 8.4147096e-01  5.4030234e-01  8.0196178e-01 ...  1.0000000e+00\n",
+      "    1.0746076e-04  1.0000000e+00]\n",
+      "  [ 9.0929741e-01 -4.1614684e-01  9.5814437e-01 ...  1.0000000e+00\n",
+      "    2.1492151e-04  1.0000000e+00]\n",
+      "  ...\n",
+      "  [ 9.5625257e-01 -2.9254240e-01  4.8925215e-01 ...  8.3807874e-01\n",
+      "    5.1154459e-01  8.5925674e-01]\n",
+      "  [ 2.7049953e-01 -9.6272010e-01  9.9170387e-01 ...  8.3801574e-01\n",
+      "    5.1163691e-01  8.5920173e-01]\n",
+      "  [-6.6394955e-01 -7.4777740e-01  6.9544029e-01 ...  8.3795273e-01\n",
+      "    5.1172924e-01  8.5914677e-01]]]\n",
+      "[1, 5000, 256]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import math\n",
+    "import numpy as np\n",
+    "\n",
+    "max_len=5000\n",
+    "d_model=256\n",
+    "\n",
+    "pe = torch.zeros(max_len, d_model)\n",
+    "position = torch.arange(0, max_len,\n",
+    "                        dtype=torch.float32).unsqueeze(1)\n",
+    "toruch_position = position\n",
+    "div_term = torch.exp(\n",
+    "    torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
+    "    -(math.log(10000.0) / d_model))\n",
+    "tourch_div_term = div_term.cpu().detach().numpy()\n",
+    "\n",
+    "torhc_sin = torch.sin(position * div_term)\n",
+    "torhc_cos = torch.cos(position * div_term)\n",
+    "\n",
+    "np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
+    "np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
+    "pe[:, 0::2] = torhc_sin\n",
+    "pe[:, 1::2] = torhc_cos\n",
+    "pe = pe.unsqueeze(0) \n",
+    "tourch_pe = pe.cpu().detach().numpy()\n",
+    "print(tourch_pe)\n",
+    "bak_pe = model.encoder.embed.pos_enc.pe\n",
+    "print(bak_pe.shape)\n",
+    "model.encoder.embed.pos_enc.pe = paddle.to_tensor(tourch_pe)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 210,
+   "id": "exempt-cloud",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n"
      ]
     }
    ],
@@ -4020,7 +4210,12 @@
     "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n",
     "\n",
     "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n",
-    "print(xs)"
+    "#print(xs)\n",
+    "data = np.load(\".notebook/enc_embed.npz\")\n",
+    "torch_pos_emb=data['pos_emb']\n",
+    "torch_xs = data['embed_out']\n",
+    "print(np.allclose(xs.numpy(), torch_xs))\n",
+    "print(np.allclose(pos_emb.numpy(), torch_pos_emb))"
    ]
   },
   {
@@ -4029,45 +4224,361 @@
    "id": "composite-involvement",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 269,
+   "id": "handed-harris",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n",
+      "True\n",
+      "False\n",
+      "True\n",
+      "[256, 2048]\n",
+      "[2048]\n",
+      "[2048, 256]\n",
+      "[256]\n",
+      "--------ff-------\n",
+      "True\n",
+      "False\n",
+      "False\n",
+      "False\n",
+      "False\n",
+      "True\n",
+      "linear_714.w_0 True\n",
+      "linear_714.b_0 True\n",
+      "linear_715.w_0 True\n",
+      "linear_715.b_0 True\n",
+      "False\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "xs = model.encoder.global_cmvn(feat)\n",
+    "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n",
+    "# embed() receives the mask cast to the dtype of xs; the returned mask is cast back to bool below.\n",
+    "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n",
+    "masks = masks.astype(paddle.bool)\n",
+    "mask_pad = masks.logical_not()\n",
+    "decoding_chunk_size=0\n",
+    "num_decoding_left_chunks=-1\n",
+    "chunk_masks = add_optional_chunk_mask(\n",
+    "            xs, masks, model.encoder.use_dynamic_chunk, model.encoder.use_dynamic_left_chunk,\n",
+    "            decoding_chunk_size, model.encoder.static_chunk_size,\n",
+    "            num_decoding_left_chunks)\n",
+    "\n",
+    "# Compare the front-end results with the reference arrays stored in enc_embed.npz.\n",
+    "data = np.load(\".notebook/enc_embed.npz\")\n",
+    "torch_pos_emb=data['pos_emb']\n",
+    "torch_xs = data['embed_out']\n",
+    "torch_chunk_masks = data['chunk_masks']\n",
+    "torch_mask_pad = data['mask_pad']\n",
+    "print(np.allclose(xs.numpy(), torch_xs))\n",
+    "print(np.allclose(pos_emb.numpy(), torch_pos_emb))\n",
+    "np.testing.assert_equal(chunk_masks.numpy(), torch_chunk_masks)\n",
+    "np.testing.assert_equal(mask_pad.numpy(), ~torch_mask_pad)\n",
+    "# Walk through the first encoder layer step by step; the break at the end stops after one layer.\n",
+    "for layer in model.encoder.encoders:\n",
+    "    #xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n",
+    "    print(layer.feed_forward_macaron is not None)\n",
+    "    print(layer.normalize_before)\n",
+    "\n",
+    "    data = np.load('.notebook/enc_0_norm_ff.npz')\n",
+    "    t_norm_ff = data['norm_ff']\n",
+    "    t_xs = data['xs']\n",
+    "\n",
+    "    # Layer-norm the layer input (norm_ff_macaron) and compare with the reference norm output.\n",
+    "    x = xs\n",
+    "    print(np.allclose(t_xs, x.numpy()))\n",
+    "    residual = x\n",
+    "    print(np.allclose(t_xs, residual.numpy()))\n",
+    "    x_nrom = layer.norm_ff_macaron(x)\n",
+    "    print(np.allclose(t.numpy(), x_nrom.numpy()))  # NOTE(review): t is not assigned in this cell; confirm an earlier cell defines it\n",
+    "    print(np.allclose(t_norm_ff, x_nrom.numpy()))\n",
+    "#     for n, p in layer.norm_ff_macaron.state_dict().items():\n",
+    "#         print(n, p)\n",
+    "#         pass\n",
+    "    # Switch to eval mode and feed the reference norm output, so only the feed-forward itself is compared.\n",
+    "    layer.eval()\n",
+    "    x_nrom = paddle.to_tensor(t_norm_ff)\n",
+    "    print(np.allclose(t_norm_ff, x_nrom.numpy()))\n",
+    "    x = residual + layer.ff_scale * layer.feed_forward_macaron(x_nrom)\n",
+    "\n",
+    "    # Gather the feed-forward parameters; they are compared with the reference parameters further down.\n",
+    "    ps = []\n",
+    "    for n, p in layer.feed_forward_macaron.state_dict().items():\n",
+    "        ps.append(p)\n",
+    "        print(p.shape)\n",
+    "\n",
+    "    # Replay the feed-forward one sub-module at a time: w_1, activation, w_2.\n",
+    "    x_nrom = paddle.to_tensor(t_norm_ff)\n",
+    "    ff_l_x = layer.feed_forward_macaron.w_1(x_nrom)\n",
+    "    ff_l_a_x = layer.feed_forward_macaron.activation(ff_l_x)\n",
+    "    ff_l_a_l_x = layer.feed_forward_macaron.w_2(ff_l_a_x)\n",
+    "    data = np.load('.notebook/enc_0_ff_out.npz', allow_pickle=True)  # allow_pickle unpickles objects: only load trusted dumps\n",
+    "    t_norm_ff = data['norm_ff']\n",
+    "    t_ff_out = data['ff_out']\n",
+    "    t_ff_l_x = data['ff_l_x']\n",
+    "    t_ff_l_a_x = data['ff_l_a_x']\n",
+    "    t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
+    "    t_ps = data['ps']\n",
+    "\n",
+    "    print(\"--------ff-------\")\n",
+    "    print(np.allclose(x_nrom.numpy(), t_norm_ff))\n",
+    "    print(np.allclose(x.numpy(), t_ff_out))\n",
+    "    print(np.allclose(ff_l_x.numpy(), t_ff_l_x))\n",
+    "    print(np.allclose(ff_l_a_x.numpy(), t_ff_l_a_x))\n",
+    "    print(np.allclose(ff_l_a_l_x.numpy(), t_ff_l_a_l_x))\n",
+    "\n",
+    "    print(np.allclose(ff_l_x.numpy(), t_ff_l_x, atol=1e-6))  # same w_1 check with a looser absolute tolerance\n",
+    "    for p, t_p in zip(ps, t_ps):  # each reference parameter is compared transposed (t_p.T)\n",
+    "        print(p.name, np.allclose(p.numpy(), t_p.T))\n",
+    "\n",
+    "    # Self-attention check: x_q, x, pos_emb and mask come from enc_0_selattn_out.npz rather than from the commented-out steps below.\n",
+    "#     residual = x\n",
+    "#     x = layer.norm_mha(x)\n",
+    "#     x_q = x\n",
+    "\n",
+    "    data = np.load('.notebook/enc_0_selattn_out.npz', allow_pickle=True)\n",
+    "    tx_q = data['x_q']\n",
+    "    tx = data['x']\n",
+    "    tpos_emb=data['pos_emb']\n",
+    "    tmask=data['mask']\n",
+    "    t_x_att = data['x_att']  # was bound to tt_x_att, leaving t_x_att (read below) undefined on a fresh kernel\n",
+    "    x_q = paddle.to_tensor(tx_q)\n",
+    "    x = paddle.to_tensor(tx)\n",
+    "    pos_emb = paddle.to_tensor(tpos_emb)\n",
+    "    mask = paddle.to_tensor(tmask)\n",
+    "\n",
+    "    x_att = layer.self_attn(x_q, x, x, pos_emb, mask)\n",
+    "    print(np.allclose(x_att.numpy(), t_x_att))\n",
+    "    print(np.allclose(x_att.numpy(), t_x_att, atol=1e-6))\n",
+    "    # Only the first layer is inspected.\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 270,
+   "id": "sonic-thumb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "False\n",
+      "True\n"
+     ]
+    }
+   ],
    "source": [
-    "\n"
+    "# Front-end: CMVN, padding mask, embedding (given the mask in the dtype of xs), then the chunk mask.\n",
+    "decoding_chunk_size = 0\n",
+    "num_decoding_left_chunks = -1\n",
+    "xs = model.encoder.global_cmvn(feat)\n",
+    "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n",
+    "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n",
+    "masks = masks.astype(paddle.bool)\n",
+    "mask_pad = masks.logical_not()\n",
+    "chunk_masks = add_optional_chunk_mask(\n",
+    "    xs, masks,\n",
+    "    model.encoder.use_dynamic_chunk, model.encoder.use_dynamic_left_chunk,\n",
+    "    decoding_chunk_size, model.encoder.static_chunk_size, num_decoding_left_chunks)\n",
+    "\n",
+    "# Compare the front-end results with the reference arrays stored in enc_embed.npz.\n",
+    "data = np.load(\".notebook/enc_embed.npz\")\n",
+    "torch_pos_emb = data['pos_emb']\n",
+    "torch_xs = data['embed_out']\n",
+    "torch_chunk_masks = data['chunk_masks']\n",
+    "torch_mask_pad = data['mask_pad']\n",
+    "print(np.allclose(xs.numpy(), torch_xs))\n",
+    "print(np.allclose(pos_emb.numpy(), torch_pos_emb))\n",
+    "np.testing.assert_equal(chunk_masks.numpy(), torch_chunk_masks)\n",
+    "np.testing.assert_equal(mask_pad.numpy(), ~torch_mask_pad)\n",
+    "\n",
+    "# Run only the first encoder layer, then compare with enc_0.npz at two tolerances.\n",
+    "for layer in model.encoder.encoders:\n",
+    "    xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n",
+    "    break\n",
+    "data = np.load('.notebook/enc_0.npz')\n",
+    "torch_xs = data['enc_0']\n",
+    "print(np.allclose(xs.numpy(), torch_xs))\n",
+    "print(np.allclose(xs.numpy(), torch_xs, atol=1e-6))\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
-   "id": "handed-harris",
+   "execution_count": 273,
+   "id": "brave-latino",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n",
+      "--------layers_______\n",
+      "False\n",
+      "True\n",
+      "[[-0.70194244  0.56254214  0.6880346  ...  1.1237319   0.7803924\n",
+      "   1.1369387 ]\n",
+      " [-0.7787783   0.3912667   0.71887773 ...  1.251882    0.886168\n",
+      "   1.3173451 ]\n",
+      " [-0.95908964  0.6346029   0.87671334 ...  0.98183745  0.7440111\n",
+      "   1.2903278 ]\n",
+      " ...\n",
+      " [-1.0732255   0.67236906  0.92303115 ...  0.9075458   0.8176712\n",
+      "   1.3239655 ]\n",
+      " [-1.1654118   0.6819967   0.6939453  ...  1.2238353   0.8028295\n",
+      "   1.4506507 ]\n",
+      " [-1.2732092   0.7145806   0.75819594 ...  0.94154835  0.8774845\n",
+      "   1.2623049 ]]\n",
+      "xxxxxx\n",
+      "[[-0.7019424   0.56254166  0.6880345  ...  1.1237322   0.78039217\n",
+      "   1.1369387 ]\n",
+      " [-0.778778    0.39126638  0.7188779  ...  1.2518823   0.8861681\n",
+      "   1.3173454 ]\n",
+      " [-0.9590891   0.6346026   0.87671363 ...  0.9818373   0.74401116\n",
+      "   1.2903274 ]\n",
+      " ...\n",
+      " [-1.0732253   0.6723689   0.9230311  ...  0.9075457   0.8176713\n",
+      "   1.3239657 ]\n",
+      " [-1.165412    0.6819976   0.69394535 ...  1.2238353   0.80282927\n",
+      "   1.4506509 ]\n",
+      " [-1.273209    0.71458095  0.75819623 ...  0.9415484   0.8774842\n",
+      "   1.2623055 ]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Front-end: CMVN, padding mask, embedding (given the mask in the dtype of xs), then the chunk mask.\n",
+    "decoding_chunk_size = 0\n",
+    "num_decoding_left_chunks = -1\n",
+    "xs = model.encoder.global_cmvn(feat)\n",
+    "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n",
+    "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n",
+    "masks = masks.astype(paddle.bool)\n",
+    "mask_pad = masks.logical_not()\n",
+    "chunk_masks = add_optional_chunk_mask(\n",
+    "    xs, masks,\n",
+    "    model.encoder.use_dynamic_chunk, model.encoder.use_dynamic_left_chunk,\n",
+    "    decoding_chunk_size, model.encoder.static_chunk_size, num_decoding_left_chunks)\n",
+    "\n",
+    "# Compare the front-end results with the reference arrays stored in enc_embed.npz.\n",
+    "data = np.load(\".notebook/enc_embed.npz\")\n",
+    "torch_pos_emb = data['pos_emb']\n",
+    "torch_xs = data['embed_out']\n",
+    "torch_chunk_masks = data['chunk_masks']\n",
+    "torch_mask_pad = data['mask_pad']\n",
+    "print(np.allclose(xs.numpy(), torch_xs))\n",
+    "print(np.allclose(pos_emb.numpy(), torch_pos_emb))\n",
+    "np.testing.assert_equal(chunk_masks.numpy(), torch_chunk_masks)\n",
+    "np.testing.assert_equal(mask_pad.numpy(), ~torch_mask_pad)\n",
+    "\n",
+    "# Run every encoder layer; i counts the layers applied so far (it stays 0 if there are none).\n",
+    "print(\"--------layers_______\")\n",
+    "i = 0\n",
+    "for i, layer in enumerate(model.encoder.encoders, 1):\n",
+    "    xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n",
+    "#     if i == 2:\n",
+    "#         data = np.load('.notebook/enc_2.npz')\n",
+    "#         torch_xs = data['enc_2']\n",
+    "#         print(np.allclose(xs.numpy(), torch_xs))\n",
+    "#         print(np.allclose(xs.numpy(), torch_xs, atol=1e-5))\n",
+    "#         print(xs[0].numpy())\n",
+    "#         print('xxxxxx')\n",
+    "#         print(torch_xs[0])\n",
+    "#         print('----i==2')\n",
+    "data = np.load('.notebook/enc_all.npz')\n",
+    "torch_xs = data['enc_all']\n",
+    "print(np.allclose(xs.numpy(), torch_xs))\n",
+    "print(np.allclose(xs.numpy(), torch_xs, atol=1e-5))\n",
+    "print(xs[0].numpy())\n",
+    "print('xxxxxx')\n",
+    "print(torch_xs[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "municipal-stock",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 278,
+   "id": "macro-season",
    "metadata": {},
    "outputs": [
     {
-     "ename": "SystemError",
-     "evalue": "(Fatal) Operator elementwise_sub raises an paddle::memory::allocation::BadAlloc exception.\nThe exception content is\n:ResourceExhaustedError: \n\nOut of memory error on GPU 0. Cannot allocate 1.010986MB memory on GPU 0, available memory is only 6.437500MB.\n\nPlease check whether there is any other process using GPU 0.\n1. If yes, please stop them, or start PaddlePaddle on another GPU.\n2. If no, please decrease the batch size of your model. \n\n (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:69)\n. (at /paddle/paddle/fluid/imperative/tracer.cc:172)\n",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mSystemError\u001b[0m                               Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-43-fb4fc80a6da8>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mencoder_out\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoder_mask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeat_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_out\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_mask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch_encoder_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m    900\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_built\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 902\u001b[0;31m             \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    904\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mforward_post_hook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_post_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/deepspeech/modules/encoder.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, xs, xs_lens, decoding_chunk_size, num_decoding_left_chunks)\u001b[0m\n\u001b[1;32m    158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    159\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_cmvn\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m             \u001b[0mxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_cmvn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    161\u001b[0m         \u001b[0;31m#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    162\u001b[0m         \u001b[0mxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos_emb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype_as\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m    900\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_built\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 902\u001b[0;31m             \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    904\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mforward_post_hook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_post_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/deepspeech/modules/cmvn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m     46\u001b[0m             \u001b[0;34m(\u001b[0m\u001b[0mpaddle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnormalized\u001b[0m \u001b[0mfeature\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     47\u001b[0m         \"\"\"\n\u001b[0;32m---> 48\u001b[0;31m         \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     49\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm_var\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     50\u001b[0m             \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mistd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py\u001b[0m in \u001b[0;36m__impl__\u001b[0;34m(self, other_var)\u001b[0m\n\u001b[1;32m    247\u001b[0m             \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    248\u001b[0m             \u001b[0mmath_op\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mops\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mmath_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother_var\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'axis'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    251\u001b[0m         \u001b[0mcomment\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOpProtoHolder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_op_proto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mSystemError\u001b[0m: (Fatal) Operator elementwise_sub raises an paddle::memory::allocation::BadAlloc exception.\nThe exception content is\n:ResourceExhaustedError: \n\nOut of memory error on GPU 0. Cannot allocate 1.010986MB memory on GPU 0, available memory is only 6.437500MB.\n\nPlease check whether there is any other process using GPU 0.\n1. If yes, please stop them, or start PaddlePaddle on another GPU.\n2. If no, please decrease the batch size of your model. \n\n (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:69)\n. (at /paddle/paddle/fluid/imperative/tracer.cc:172)\n"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[-0.7019424   0.5625421   0.68803453 ...  1.1237317   0.7803923\n",
+      "   1.1369386 ]\n",
+      " [-0.7787783   0.39126673  0.71887773 ...  1.251882    0.886168\n",
+      "   1.3173451 ]\n",
+      " [-0.95908964  0.6346029   0.87671334 ...  0.98183745  0.7440111\n",
+      "   1.2903278 ]\n",
+      " ...\n",
+      " [-1.0732255   0.67236906  0.92303115 ...  0.9075458   0.8176712\n",
+      "   1.3239655 ]\n",
+      " [-1.1654117   0.68199664  0.6939452  ...  1.2238352   0.8028294\n",
+      "   1.4506506 ]\n",
+      " [-1.2732091   0.71458054  0.7581958  ...  0.9415482   0.8774844\n",
+      "   1.2623048 ]]\n",
+      "---\n",
+      "[[-0.7019424   0.56254166  0.6880345  ...  1.1237322   0.78039217\n",
+      "   1.1369387 ]\n",
+      " [-0.778778    0.39126638  0.7188779  ...  1.2518823   0.8861681\n",
+      "   1.3173454 ]\n",
+      " [-0.9590891   0.6346026   0.87671363 ...  0.9818373   0.74401116\n",
+      "   1.2903274 ]\n",
+      " ...\n",
+      " [-1.0732253   0.6723689   0.9230311  ...  0.9075457   0.8176713\n",
+      "   1.3239657 ]\n",
+      " [-1.165412    0.6819976   0.69394535 ...  1.2238353   0.80282927\n",
+      "   1.4506509 ]\n",
+      " [-1.2732087   0.71458083  0.7581961  ...  0.9415482   0.877484\n",
+      "   1.2623053 ]]\n",
+      "False\n",
+      "True\n",
+      "False\n"
      ]
     }
    ],
    "source": [
-    "encoder_out, encoder_mask = model.encoder(feat, feat_len)\n",
-    "print(encoder_out.shape)\n",
-    "print(encoder_mask.shape)\n",
-    "print(encoder_out[0])\n",
-    "print(torch_encoder_out[0])"
+    "encoder_out, mask = model.encoder(feat, feat_len)\n",
+    "enc_out_np = encoder_out.numpy()\n",
+    "print(enc_out_np[0])\n",
+    "print(\"---\")\n",
+    "print(torch_encoder_out[0])\n",
+    "for atol in (1e-8, 1e-5, 1e-6):  # 1e-8 is numpy's default atol, then two looser tolerances\n",
+    "    print(np.allclose(torch_encoder_out, enc_out_np, atol=atol))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "sonic-thumb",
+   "id": "associate-sampling",
    "metadata": {},
    "outputs": [],
    "source": []