diff --git a/.notebook/Linear_test.ipynb b/.notebook/Linear_test.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a11da386fd0ceeb17ca5d12c50c9505c64a52dc3 --- /dev/null +++ b/.notebook/Linear_test.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "academic-surname", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " def convert_to_list(value, n, name, dtype=np.int):\n" + ] + } + ], + "source": [ + "import paddle\n", + "from paddle import nn" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fundamental-treasure", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + } + ], + "source": [ + "L = nn.Linear(256, 2048)\n", + "L2 = nn.Linear(2048, 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "consolidated-elephant", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import torch\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "moderate-noise", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "float64\n", + "Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[[-0.03137276, 0.75036579, -0.62955737, ..., -0.39516482, 2.41965628, 0.19466873],\n", + " [ 0.55916852, 1.13357353, 0.28754908, ..., 0.28860641, 0.48257691, -1.07664418],\n", + " [-0.27433595, -0.05911482, 0.04942252, ..., 0.46596146, 1.24395037, -1.98374581],\n", + " ...,\n", + " [-0.45322138, 0.51459873, 0.28475651, ..., -0.90797561, -0.80436397, -2.30388594],\n", + " [ 0.20310247, 1.90435207, -1.02483511, ..., -1.59850407, -0.30733466, 0.49769276],\n", + " [-2.63085651, -0.52244109, 0.32019949, ..., 1.10662329, -0.55995786, -0.36770794]],\n", + "\n", + " [[-1.78831303, 2.24759626, 0.41386250, ..., -0.30020580, -0.16084948, 0.93251175],\n", + " [ 0.03264519, -0.92942363, 1.58523536, ..., 1.23681784, -0.94711000, 0.63553023],\n", + " [-0.19725564, -2.38587499, -0.29334834, ..., 0.83498263, -0.58492625, 0.58732986],\n", + " ...,\n", + " [-0.61646742, -1.02978027, 0.45410269, ..., 0.87052751, -0.20801133, 2.17943859],\n", + " [-0.67230755, -0.79410625, -0.13054833, ..., -1.18138039, -0.47578079, -0.22610545],\n", + " [ 2.57333422, 0.63872230, 0.70852041, ..., -0.44040251, -0.33339104, -0.24722832]]])\n", + "tensor([[[-0.0314, 0.7504, -0.6296, ..., -0.3952, 2.4197, 0.1947],\n", + " [ 0.5592, 1.1336, 0.2875, ..., 0.2886, 0.4826, -1.0766],\n", + " [-0.2743, -0.0591, 0.0494, ..., 0.4660, 1.2440, -1.9837],\n", + " ...,\n", + " [-0.4532, 0.5146, 0.2848, ..., -0.9080, -0.8044, -2.3039],\n", + " [ 0.2031, 1.9044, -1.0248, ..., -1.5985, -0.3073, 0.4977],\n", + " [-2.6309, -0.5224, 0.3202, ..., 1.1066, -0.5600, -0.3677]],\n", + "\n", + " [[-1.7883, 2.2476, 0.4139, ..., -0.3002, -0.1608, 0.9325],\n", + " [ 0.0326, -0.9294, 1.5852, ..., 1.2368, -0.9471, 0.6355],\n", + " [-0.1973, -2.3859, -0.2933, ..., 0.8350, -0.5849, 0.5873],\n", + " ...,\n", + " [-0.6165, -1.0298, 0.4541, ..., 0.8705, -0.2080, 2.1794],\n", + " [-0.6723, -0.7941, -0.1305, ..., -1.1814, -0.4758, -0.2261],\n", + " [ 2.5733, 0.6387, 0.7085, ..., -0.4404, -0.3334, -0.2472]]])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + } + ], + "source": [ + "x = np.random.randn(2, 51, 256)\n", + "print(x.dtype)\n", + "px = paddle.to_tensor(x, dtype='float32')\n", + "tx = torch.tensor(x, dtype=torch.float32)\n", + "print(px)\n", + "print(tx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cooked-progressive", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "mechanical-prisoner", + "metadata": {}, + "outputs": [], + "source": [ + "data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n", + "t_norm_ff = data['norm_ff']\n", + "t_ff_out = data['ff_out']\n", + "t_ff_l_x = data['ff_l_x']\n", + "t_ff_l_a_x = data['ff_l_a_x']\n", + "t_ff_l_a_l_x = data['ff_l_a_l_x']\n", + "t_ps = data['ps']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "indie-marriage", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "assured-zambia", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n", + "L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n", + "\n", + "ps = []\n", + "for n, p in L.named_parameters():\n", + " ps.append(p)\n", + "\n", + "for n, p in L2.state_dict().items():\n", + " ps.append(p)\n", + " \n", + "for p, tp in zip(ps, t_ps):\n", + " print(np.allclose(p.numpy(), tp.T))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "committed-jacob", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "extreme-traffic", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "optimum-milwaukee", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "viral-indian", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n", + "# t_norm_ff = data['norm_ff']\n", + "# t_ff_out = data['ff_out']\n", + "# t_ff_l_x = data['ff_l_x']\n", + "# t_ff_l_a_x = data['ff_l_a_x']\n", + "# t_ff_l_a_l_x = data['ff_l_a_l_x']\n", + "# t_ps = data['ps']\n", + "TL = torch.nn.Linear(256, 2048)\n", + "TL2 = torch.nn.Linear(2048, 256)\n", + "TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n", + "TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n", + "\n", + "# for n, p in TL.named_parameters():\n", + "# print(n, p)\n", + "# for n, p in TL2.named_parameters():\n", + "# print(n, p)\n", + "\n", + "ps = []\n", + "for n, p in TL.state_dict().items():\n", + " ps.append(p.data.numpy())\n", + " \n", + "for n, p in TL2.state_dict().items():\n", + " ps.append(p.data.numpy())\n", + " \n", + "for p, tp in zip(ps, t_ps):\n", + " print(np.allclose(p, tp))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "skilled-vietnamese", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[-0.25528666 -0.9090747 0.12996 ... 0.02552819 0.37376517\n", + " -0.558986 ]\n", + " [-0.45657372 0.23811203 0.33472425 ... 1.0797666 -0.7263612\n", + " 0.31549692]]\n", + "[[-0.25528657 -0.9090746 0.12996009 ... 0.02552832 0.37376505\n", + " -0.5589858 ]\n", + " [-0.45657367 0.23811209 0.33472428 ... 1.0797666 -0.7263612\n", + " 0.31549698]]\n", + "True\n", + "False\n" + ] + } + ], + "source": [ + "y = L(px)\n", + "print(y.numpy())\n", + "\n", + "ty = TL(tx)\n", + "print(ty.data.numpy())\n", + "print(np.allclose(px.numpy(), tx.detach().numpy()))\n", + "print(np.allclose(y.numpy(), ty.detach().numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "incorrect-allah", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "prostate-cameroon", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "governmental-surge", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.07453135 0.0698561 0.6273111 ... 0.5845924 -0.65527105\n", + " 0.5881643 ]\n", + " [ 0.3902049 -0.17455879 -1.1802813 ... -0.36912322 0.55681896\n", + " -0.11917676]]\n", + "[[ 0.07453132 0.06985616 0.62731117 ... 0.5845925 -0.65527105\n", + " 0.5881642 ]\n", + " [ 0.39020485 -0.17455864 -1.1802814 ... -0.3691232 0.556819\n", + " -0.11917675]]\n", + "True\n", + "False\n", + "True\n" + ] + } + ], + "source": [ + "x = np.random.randn(2, 256)\n", + "px = paddle.to_tensor(x, dtype='float32')\n", + "tx = torch.tensor(x, dtype=torch.float32)\n", + "y = L(px)\n", + "print(y.numpy())\n", + "ty = TL(tx)\n", + "print(ty.data.numpy())\n", + "print(np.allclose(px.numpy(), tx.detach().numpy()))\n", + "print(np.allclose(y.numpy(), ty.detach().numpy()))\n", + "print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "confidential-jacket", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "improved-civilization", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.notebook/layer_norm_test.ipynb b/.notebook/layer_norm_test.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..eac3566ff0590295a1f3b742cd8d038f420500ce --- /dev/null +++ b/.notebook/layer_norm_test.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 32, + "id": "academic-surname", + "metadata": {}, + "outputs": [], + "source": [ + "import paddle\n", + "from paddle import nn" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "fundamental-treasure", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameter containing:\n", + "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n", + "Parameter containing:\n", + "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n" + ] + } + ], + "source": [ + "L = nn.LayerNorm(256, epsilon=1e-12)\n", + "for p in L.parameters():\n", + " print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "consolidated-elephant", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "moderate-noise", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "float64\n" + ] + } + ], + "source": [ + "x = np.random.randn(2, 51, 256)\n", + "print(x.dtype)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "cooked-progressive", + "metadata": {}, + "outputs": [], + "source": [ + "y = L(paddle.to_tensor(x, dtype='float32'))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "optimum-milwaukee", + "metadata": {}, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "viral-indian", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameter containing:\n", + "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1.], requires_grad=True)\n", + "Parameter containing:\n", + "tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " requires_grad=True)\n" + ] + } + ], + "source": [ + "TL = torch.nn.LayerNorm(256, eps=1e-12)\n", + "for p in TL.parameters():\n", + " print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "skilled-vietnamese", + "metadata": {}, + "outputs": [], + "source": [ + "ty = TL(torch.tensor(x, dtype=torch.float32))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "incorrect-allah", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.allclose(y.numpy(), ty.detach().numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "prostate-cameroon", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "governmental-surge", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = np.random.randn(2, 256)\n", + "y = L(paddle.to_tensor(x, dtype='float32'))\n", + "ty = TL(torch.tensor(x, dtype=torch.float32))\n", + "np.allclose(y.numpy(), ty.detach().numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "confidential-jacket", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.notebook/position_embeding_check.ipynb b/.notebook/position_embeding_check.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f7410c256b69dabfdaaebb2ff6fa8e45aa83ec31 --- /dev/null +++ b/.notebook/position_embeding_check.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "id": "designing-borough", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n", + " 0.0000000e+00 0.0000000e+00]\n", + " [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n", + " 1.1547816e-04 1.0746076e-04]\n", + " [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n", + " 2.3095631e-04 2.1492151e-04]\n", + " ...\n", + " [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n", + " 1.1201146e-02 1.0423505e-02]\n", + " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n", + " 1.1316618e-02 1.0530960e-02]\n", + " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n", + " 1.1432089e-02 1.0638415e-02]]\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "import torch\n", + "import math\n", + "import numpy as np\n", + "\n", + "max_len=100\n", + "d_model=256\n", + "\n", + "pe = torch.zeros(max_len, d_model)\n", + "position = torch.arange(0, max_len,\n", + " dtype=torch.float32).unsqueeze(1)\n", + "toruch_position = position\n", + "div_term = torch.exp(\n", + " torch.arange(0, d_model, 2, dtype=torch.float32) *\n", + " -(math.log(10000.0) / d_model))\n", + "tourch_div_term = div_term.cpu().detach().numpy()\n", + "\n", + "\n", + "\n", + "torhc_sin = torch.sin(position * div_term)\n", + "torhc_cos = torch.cos(position * div_term)\n", + "print(torhc_sin.cpu().detach().numpy())\n", + "np_sin = np.sin((position * div_term).cpu().detach().numpy())\n", + "np_cos = np.cos((position * div_term).cpu().detach().numpy())\n", + "print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n", + "print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n", + "pe[:, 0::2] = torhc_sin\n", + "pe[:, 1::2] = torhc_cos\n", + "tourch_pe = pe.cpu().detach().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "swiss-referral", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "False\n", + "False\n", + "False\n", + "False\n" + ] + } + ], + "source": [ + "import paddle\n", + "ppe = paddle.zeros([max_len, d_model])\n", + "position = paddle.arange(0, max_len,\n", + " dtype='float32').unsqueeze(1)\n", + "print(np.allclose(position.numpy(), toruch_position))\n", + "div_term = paddle.exp(\n", + " paddle.arange(0, d_model, 2, dtype='float32') *\n", + " -(math.log(10000.0) / d_model))\n", + "print(np.allclose(div_term.numpy(), tourch_div_term))\n", + "\n", + "\n", + "\n", + "p_sin = paddle.sin(position * div_term)\n", + "p_cos = paddle.cos(position * div_term)\n", + "print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n", + "print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n", + "ppe[:, 0::2] = p_sin\n", + "ppe[:, 1::2] = p_cos\n", + "print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n", + "print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "integrated-boards", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "False\n" + ] + } + ], + "source": [ + "print(np.allclose(ppe.numpy(), pe.numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "flying-reserve", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "revised-divide", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.notebook/u2_model.ipynb b/.notebook/u2_model.ipynb index f9e7c1eecd879b75f231dba3fed860879361c8ed..4f2c9632f4634e83924515925aafd2c52c20a270 100644 --- a/.notebook/u2_model.ipynb +++ b/.notebook/u2_model.ipynb @@ -100,7 +100,7 @@ "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n", - "[INFO 2021/04/19 06:57:01 u2.py:834] U2 Encoder type: conformer\n" + "[INFO 2021/04/20 03:32:21 u2.py:834] U2 Encoder type: conformer\n" ] }, { @@ -1439,13 +1439,7 @@ "decoder.decoders.3.feed_forward.w_2.weight | [2048, 256] | 524288\n", "decoder.decoders.3.feed_forward.w_2.bias | [256] | 256\n", "decoder.decoders.3.norm1.weight | [256] | 256\n", - "decoder.decoders.3.norm1.bias | [256] | 256\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "decoder.decoders.3.norm1.bias | [256] | 256\n", "decoder.decoders.3.norm2.weight | [256] | 256\n", "decoder.decoders.3.norm2.bias | [256] | 256\n", "decoder.decoders.3.norm3.weight | [256] | 256\n", @@ -1526,7 +1520,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "ruled-invitation", "metadata": {}, "outputs": [ @@ -2184,6 +2178,16 @@ "print(model)" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fossil-means", + "metadata": {}, + "outputs": [], + "source": [ + "# load feat" + ] + }, { "cell_type": "code", "execution_count": 7, @@ -2194,13 +2198,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "compute_cmvn_loader_test.ipynb jit_infer.ipynb\r\n", - "dataloader.ipynb mask_and_masked_fill_test.ipynb\r\n", - "dataloader_with_tokens_tokenids.ipynb model.npz\r\n", - "data.npz python_test.ipynb\r\n", - "decoder.npz train_test.ipynb\r\n", - "encoder.npz u2_model.ipynb\r\n", - "hack_api_test.ipynb\r\n" + "compute_cmvn_loader_test.ipynb encoder.npz\r\n", + "dataloader.ipynb hack_api_test.ipynb\r\n", + "dataloader_with_tokens_tokenids.ipynb jit_infer.ipynb\r\n", + "data.npz layer_norm_test.ipynb\r\n", + "decoder.npz Linear_test.ipynb\r\n", + "enc_0_ff_out.npz mask_and_masked_fill_test.ipynb\r\n", + "enc_0_norm_ff.npz model.npz\r\n", + "enc_0.npz position_embeding_check.ipynb\r\n", + "enc_0_selattn_out.npz python_test.ipynb\r\n", + "enc_2.npz train_test.ipynb\r\n", + "enc_all.npz u2_model.ipynb\r\n", + "enc_embed.npz\r\n" ] } ], @@ -2213,21 +2222,6 @@ "execution_count": 8, "id": "abroad-oracle", "metadata": {}, - "outputs": [], - "source": [ - "data = np.load('.notebook/data.npz', allow_pickle=True)\n", - "keys=data['keys']\n", - "feat=data['feat']\n", - "feat_len=data['feat_len']\n", - "text=data['text']\n", - "text_len=data['text_len']" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "false-instrument", - "metadata": {}, "outputs": [ { "name": "stdout", @@ -2311,6 +2305,12 @@ } ], "source": [ + "data = np.load('.notebook/data.npz', allow_pickle=True)\n", + "keys=data['keys']\n", + "feat=data['feat']\n", + "feat_len=data['feat_len']\n", + "text=data['text']\n", + "text_len=data['text_len']\n", "print(keys)\n", "print(feat.shape)\n", "print(feat)\n", @@ -2321,7 +2321,15 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, + "id": "false-instrument", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, "id": "arctic-proxy", "metadata": {}, "outputs": [], @@ -2400,7 +2408,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, + "id": "seasonal-switch", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, "id": "defined-brooks", "metadata": {}, "outputs": [ @@ -2408,17 +2424,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "compute_cmvn_loader_test.ipynb\t jit_infer.ipynb\r\n", - "dataloader.ipynb\t\t mask_and_masked_fill_test.ipynb\r\n", - "dataloader_with_tokens_tokenids.ipynb model.npz\r\n", - "data.npz\t\t\t python_test.ipynb\r\n", - "decoder.npz\t\t\t train_test.ipynb\r\n", - "encoder.npz\t\t\t u2_model.ipynb\r\n", - "hack_api_test.ipynb\r\n" + "compute_cmvn_loader_test.ipynb\t encoder.npz\r\n", + "dataloader.ipynb\t\t hack_api_test.ipynb\r\n", + "dataloader_with_tokens_tokenids.ipynb jit_infer.ipynb\r\n", + "data.npz\t\t\t layer_norm_test.ipynb\r\n", + "decoder.npz\t\t\t Linear_test.ipynb\r\n", + "enc_0_ff_out.npz\t\t mask_and_masked_fill_test.ipynb\r\n", + "enc_0_norm_ff.npz\t\t model.npz\r\n", + "enc_0.npz\t\t\t position_embeding_check.ipynb\r\n", + "enc_0_selattn_out.npz\t\t python_test.ipynb\r\n", + "enc_2.npz\t\t\t train_test.ipynb\r\n", + "enc_all.npz\t\t\t u2_model.ipynb\r\n", + "enc_embed.npz\r\n" ] } ], "source": [ + "# load model param\n", "!ls .notebook\n", "data = np.load('.notebook/model.npz', allow_pickle=True)\n", "state_dict = data['state'].item()\n", @@ -2445,7 +2467,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "confident-piano", "metadata": {}, "outputs": [ @@ -2478,6 +2500,7 @@ } ], "source": [ + "# compute loss\n", "import paddle\n", "feat=paddle.to_tensor(feat)\n", "feat_len=paddle.to_tensor(feat_len, dtype='int64')\n", @@ -2492,12 +2515,15 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "better-senator", "metadata": {}, "outputs": [], "source": [ - "# tensor(142.4858, device='cuda:0', grad_fn=) tensor(41.8416, device='cuda:0', grad_fn=) tensor(377.3222, device='cuda:0', grad_fn=)" + "# tensor(142.4888, device='cuda:0', grad_fn=) \n", + "# tensor(41.8415, device='cuda:0', grad_fn=) \n", + "# tensor(377.3326, device='cuda:0', grad_fn=)\n", + "# 142.4888 41.84146 377.33258" ] }, { @@ -2510,7 +2536,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "olympic-problem", "metadata": {}, "outputs": [ @@ -2532,23 +2558,16 @@ } ], "source": [ + "# ecnoder\n", "encoder_out, encoder_mask = model.encoder(feat, feat_len)\n", "print(encoder_out.shape)\n", "print(encoder_mask.shape)\n", - "print(encoder_out[0])\n" + "print(encoder_out[0])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "cubic-values", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "shaped-alaska", "metadata": {}, "outputs": [ @@ -2571,7 +2590,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "federal-rover", "metadata": {}, "outputs": [ @@ -2589,7 +2608,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "regulated-interstate", "metadata": {}, "outputs": [ @@ -2610,18 +2629,38 @@ " [-1.165412 0.6819976 0.69394535 ... 1.2238353 0.80282927\n", " 1.4506509 ]\n", " [-1.2732087 0.71458083 0.7581961 ... 0.9415482 0.877484\n", - " 1.2623053 ]]\n" + " 1.2623053 ]]\n", + "----\n", + "[[-0.7019418 0.56254166 0.6880346 ... 1.1237322 0.78039235\n", + " 1.1369387 ]\n", + " [-0.7787781 0.39126658 0.71887815 ... 1.2518822 0.8861679\n", + " 1.3173453 ]\n", + " [-0.95908946 0.6346025 0.87671334 ... 0.9818373 0.7440108\n", + " 1.2903266 ]\n", + " ...\n", + " [-1.073225 0.67236906 0.9230311 ... 0.9075456 0.81767166\n", + " 1.3239657 ]\n", + " [-1.1654116 0.68199694 0.69394493 ... 1.2238349 0.8028289\n", + " 1.4506508 ]\n", + " [-1.2732095 0.7145803 0.7581956 ... 0.9415491 0.87748396\n", + " 1.2623051 ]]\n", + "True\n", + "False\n" ] } ], "source": [ "print(np.allclose(torch_encoder_out, encoder_out.numpy()))\n", - "print(torch_encoder_out[0])" + "print(torch_encoder_out[0])\n", + "print(\"----\")\n", + "print(encoder_out.numpy()[0])\n", + "print(np.allclose(torch_encoder_out, encoder_out.numpy(), atol=1e-5, rtol=1e-6))\n", + "print(np.allclose(torch_encoder_out, encoder_out.numpy(), atol=1e-6, rtol=1e-6))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "proof-scheduling", "metadata": {}, "outputs": [ @@ -2630,23 +2669,23 @@ "output_type": "stream", "text": [ "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [377.32220459])\n", + " [377.33258057])\n", "[1.]\n", - "[[ 3.1708076e+00 -1.5184805e-02 4.9524564e-02 ... -2.4678309e-03\n", - " -5.9236852e-03 -7.2192554e-03]\n", - " [-1.7474542e+00 7.7654729e-03 -4.5106117e-02 ... 9.8463835e-04\n", - " 2.4569160e-03 2.2863639e-03]\n", - " [-2.3707268e+00 1.3136451e-02 -2.6281785e-02 ... 2.2738585e-03\n", - " 5.7726162e-03 7.4628354e-03]\n", + "[[ 3.16902876e+00 -1.51763987e-02 4.91095744e-02 ... -2.47971853e-03\n", + " -5.93360700e-03 -7.26609165e-03]\n", + " [-1.74184477e+00 7.75874173e-03 -4.49434854e-02 ... 9.92412097e-04\n", + " 2.46337592e-03 2.31892057e-03]\n", + " [-2.33343339e+00 1.30475955e-02 -2.66557075e-02 ... 2.27532350e-03\n", + " 5.76924905e-03 7.48788286e-03]\n", " ...\n", - " [-4.4350743e+00 2.4916438e-02 -9.0385124e-02 ... 4.4534383e-03\n", - " 1.1696636e-02 1.4515720e-02]\n", - " [-3.3899918e+00 1.7287316e-02 -6.3514955e-02 ... 3.2612216e-03\n", - " 8.5411733e-03 1.0692922e-02]\n", - " [-6.6964636e+00 3.5097409e-02 -1.2437013e-01 ... 6.3515711e-03\n", - " 1.6078018e-02 2.0318989e-02]]\n", - "[-4.4341431e+00 2.3347888e-02 -9.3501516e-02 ... 4.2512305e-03\n", - " 1.0928102e-02 1.3750527e-02]\n" + " [-4.30358458e+00 2.46054661e-02 -9.00950655e-02 ... 4.43156436e-03\n", + " 1.16122244e-02 1.44715561e-02]\n", + " [-3.36921120e+00 1.73153952e-02 -6.36872873e-02 ... 3.28363618e-03\n", + " 8.58010259e-03 1.07794888e-02]\n", + " [-6.62045336e+00 3.49955931e-02 -1.23962618e-01 ... 6.36671018e-03\n", + " 1.60814095e-02 2.03891303e-02]]\n", + "[-4.3777819e+00 2.3245810e-02 -9.3339294e-02 ... 4.2569344e-03\n", + " 1.0919910e-02 1.3787797e-02]\n" ] } ], @@ -2679,23 +2718,25 @@ "print(loss_ctc.grad)\n", "print(model.ctc.ctc_lo.weight.grad)\n", "print(model.ctc.ctc_lo.bias.grad)\n", - "# tensor(377.3222, device='cuda:0', grad_fn=)\n", + "\n", + "\n", + "# tensor(377.3326, device='cuda:0', grad_fn=)\n", "# None\n", - "# tensor([[ 3.1708e+00, -1.7475e+00, -2.3708e+00, ..., -4.4351e+00,\n", - "# -3.3900e+00, -6.6965e+00],\n", - "# [-1.5185e-02, 7.7655e-03, 1.3137e-02, ..., 2.4917e-02,\n", - "# 1.7287e-02, 3.5098e-02],\n", - "# [ 4.9522e-02, -4.5104e-02, -2.6280e-02, ..., -9.0381e-02,\n", - "# -6.3512e-02, -1.2436e-01],\n", - "# ...,\n", - "# [-2.4678e-03, 9.8464e-04, 2.2739e-03, ..., 4.4535e-03,\n", - "# 3.2612e-03, 6.3516e-03],\n", - "# [-5.9237e-03, 2.4569e-03, 5.7726e-03, ..., 1.1697e-02,\n", - "# 8.5412e-03, 1.6078e-02],\n", - "# [-7.2193e-03, 2.2864e-03, 7.4629e-03, ..., 1.4516e-02,\n", - "# 1.0693e-02, 2.0319e-02]], device='cuda:0')\n", - "# tensor([-4.4342e+00, 2.3348e-02, -9.3497e-02, ..., 4.2513e-03,\n", - "# 1.0928e-02, 1.3751e-02], device='cuda:0')" + "# [[ 3.16902351e+00 -1.51765049e-02 4.91097234e-02 ... -2.47973716e-03\n", + "# -5.93366381e-03 -7.26613170e-03]\n", + "# [-1.74185038e+00 7.75875803e-03 -4.49435972e-02 ... 9.92415240e-04\n", + "# 2.46338220e-03 2.31891591e-03]\n", + "# [-2.33343077e+00 1.30476682e-02 -2.66557615e-02 ... 2.27533933e-03\n", + "# 5.76929189e-03 7.48792710e-03]\n", + "# ...\n", + "# [-4.30356789e+00 2.46056803e-02 -9.00955945e-02 ... 4.43160534e-03\n", + "# 1.16123557e-02 1.44716976e-02]\n", + "# [-3.36919212e+00 1.73155665e-02 -6.36875406e-02 ... 3.28367390e-03\n", + "# 8.58021621e-03 1.07796099e-02]\n", + "# [-6.62039661e+00 3.49958315e-02 -1.23963736e-01 ... 6.36674836e-03\n", + "# 1.60815325e-02 2.03892551e-02]]\n", + "# [-4.3777566e+00 2.3245990e-02 -9.3339972e-02 ... 4.2569702e-03\n", + "# 1.0920014e-02 1.3787906e-02]" ] }, { @@ -2708,7 +2749,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "synthetic-hungarian", "metadata": {}, "outputs": [ @@ -2717,7 +2758,7 @@ "output_type": "stream", "text": [ "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [41.84160995]) 0.0\n" + " [41.84146118]) 0.0\n" ] } ], @@ -2730,17 +2771,15 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "indian-sweden", "metadata": {}, "outputs": [], - "source": [ - "# encoder, decoder不对齐" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 202, "id": "marine-cuisine", "metadata": {}, "outputs": [ @@ -2772,7 +2811,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 180, "id": "several-result", "metadata": {}, "outputs": [], @@ -2833,7 +2872,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 181, "id": "possible-bulgaria", "metadata": {}, "outputs": [ @@ -2890,7 +2929,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 285, "id": "north-walter", "metadata": {}, "outputs": [ @@ -2898,25 +2937,49 @@ "name": "stdout", "output_type": "stream", "text": [ - "[16, 7, 4233]\n", - "Tensor(shape=[7, 4233], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[-0.37638962, -0.82272029, 0.74276292, ..., 0.34200522, 0.01503509, 0.40337229],\n", - " [-0.87386417, -0.31389427, 0.41987872, ..., 0.37723723, -0.14352795, -1.00236630],\n", - " [-0.43505096, 0.03450463, -0.28710306, ..., 0.07727426, -1.16722453, -0.26848495],\n", - " ...,\n", - " [ 0.42471474, 0.58885634, 0.02020410, ..., 0.37405482, 0.04546990, -0.37139422],\n", - " [-0.37978464, -0.81084198, 0.75725073, ..., 0.26038912, -0.00079346, 0.42537683],\n", - " [-0.38279879, -0.81206709, 0.74943423, ..., 0.26172996, -0.00104988, 0.42678767]])\n", - "False\n" + "False\n", + "True\n", + "False\n", + "[[-3.76389682e-01 -8.22720408e-01 7.42762923e-01 ... 3.42005253e-01\n", + " 1.50350705e-02 4.03372347e-01]\n", + " [-8.73864174e-01 -3.13894272e-01 4.19878662e-01 ... 3.77237231e-01\n", + " -1.43528014e-01 -1.00236630e+00]\n", + " [-4.35050905e-01 3.45046446e-02 -2.87102997e-01 ... 7.72742853e-02\n", + " -1.16722476e+00 -2.68485069e-01]\n", + " ...\n", + " [ 4.24714804e-01 5.88856399e-01 2.02039629e-02 ... 3.74054879e-01\n", + " 4.54700664e-02 -3.71394157e-01]\n", + " [-3.79784584e-01 -8.10841978e-01 7.57250786e-01 ... 2.60389000e-01\n", + " -7.93404877e-04 4.25376773e-01]\n", + " [-3.82798851e-01 -8.12067091e-01 7.49434292e-01 ... 2.61730075e-01\n", + " -1.04988366e-03 4.26787734e-01]]\n", + "---\n", + "[[-3.7638968e-01 -8.2272053e-01 7.4276292e-01 ... 3.4200522e-01\n", + " 1.5034772e-02 4.0337229e-01]\n", + " [-8.7386459e-01 -3.1389427e-01 4.1987866e-01 ... 3.7723729e-01\n", + " -1.4352810e-01 -1.0023664e+00]\n", + " [-4.3505096e-01 3.4504786e-02 -2.8710306e-01 ... 7.7274129e-02\n", + " -1.1672243e+00 -2.6848501e-01]\n", + " ...\n", + " [ 4.2471480e-01 5.8885634e-01 2.0203922e-02 ... 3.7405500e-01\n", + " 4.5470044e-02 -3.7139410e-01]\n", + " [-3.7978446e-01 -8.1084180e-01 7.5725085e-01 ... 2.6038891e-01\n", + " -7.9347193e-04 4.2537671e-01]\n", + " [-3.8279903e-01 -8.1206715e-01 7.4943429e-01 ... 2.6173013e-01\n", + " -1.0499060e-03 4.2678756e-01]]\n" ] } ], "source": [ "decoder_out, _ = model.decoder(encoder_out, encoder_mask, ys_in_pad,\n", " ys_in_lens)\n", - "print(decoder_out.shape)\n", - "print(decoder_out[0])\n", - "print(np.allclose(decoder_out.numpy(), torch_decoder_out))" + "\n", + "print(np.allclose(decoder_out.numpy(), torch_decoder_out))\n", + "print(np.allclose(decoder_out.numpy(), torch_decoder_out, atol=1e-6))\n", + "print(np.allclose(decoder_out.numpy(), torch_decoder_out, atol=1e-7))\n", + "print(decoder_out.numpy()[0])\n", + "print('---')\n", + "print(torch_decoder_out[0])" ] }, { @@ -2945,13 +3008,15 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 183, "id": "assisted-fortune", "metadata": {}, "outputs": [], "source": [ "from paddle import nn\n", "import paddle\n", + "from paddle.nn import functional as F\n", + "\n", "class LabelSmoothingLoss(nn.Layer):\n", "\n", " def __init__(self,\n", @@ -3016,7 +3081,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 184, "id": "weighted-delight", "metadata": {}, "outputs": [ @@ -3034,7 +3099,7 @@ " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n", " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363]])\n", "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", - " [41.84160995])\n", + " [41.84146118])\n", "VarType.INT64\n" ] } @@ -3049,7 +3114,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 286, "id": "dress-shelter", "metadata": {}, "outputs": [ @@ -3058,7 +3123,7 @@ "output_type": "stream", "text": [ "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", - " [41.84160995])\n", + " [41.84146118])\n", "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", " [41.84146118])\n", "4233\n", @@ -3094,7 +3159,39 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, + "id": "going-hungary", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "naughty-citizenship", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "experimental-emerald", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adverse-saskatchewan", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 27, "id": "speaking-shelf", "metadata": {}, "outputs": [], @@ -3261,7 +3358,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "id": "sharp-municipality", "metadata": {}, "outputs": [], @@ -3351,7 +3448,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "id": "tutorial-syndication", "metadata": {}, "outputs": [], @@ -3377,7 +3474,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "id": "fuzzy-register", "metadata": {}, "outputs": [ @@ -3397,7 +3494,55 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, + "id": "explicit-triumph", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "humanitarian-belgium", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dying-proposal", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "honest-quick", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bound-cholesterol", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "viral-packaging", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 203, "id": "balanced-locator", "metadata": {}, "outputs": [ @@ -3431,7 +3576,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 204, "id": "induced-proposition", "metadata": {}, "outputs": [ @@ -3499,7 +3644,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 205, "id": "cutting-julian", "metadata": {}, "outputs": [ @@ -3833,7 +3978,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 206, "id": "friendly-nightlife", "metadata": {}, "outputs": [ @@ -3940,7 +4085,15 @@ " ...,\n", " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]]])\n" + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]]])\n", + "Tensor(shape=[1, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[[ 0. , 1. , 0. , ..., 1. , 0. , 1. ],\n", + " [ 0.84147102, 0.54030228, 0.80196184, ..., 1. , 0.00010746, 1. ],\n", + " [ 0.90929747, -0.41614681, 0.95814437, ..., 1. , 0.00021492, 1. ],\n", + " ...,\n", + " [-0.76825470, -0.64014435, 0.63279730, ..., 0.99998462, 0.00515809, 0.99998671],\n", + " [-0.95375264, 0.30059254, 0.99899054, ..., 0.99998397, 0.00526555, 0.99998611],\n", + " [-0.26237485, 0.96496606, 0.56074661, ..., 0.99998331, 0.00537301, 0.99998558]]])\n" ] } ], @@ -3949,69 +4102,106 @@ "x = model.encoder.embed.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))\n", "print(x)\n", "x, pos_emb = model.encoder.embed.pos_enc(x, 0)\n", - "print(x)" + "print(x)\n", + "print(pos_emb)" ] }, { "cell_type": "code", - "execution_count": 37, - "id": "exempt-cloud", + "execution_count": 207, + "id": "guilty-cache", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[[-0.54821998, 2.28660274, -1.07501972, ..., 1.45036042, 0.28950194, -0.69454080],\n", - " [-0.80125421, 1.76875579, -1.66388774, ..., 1.83315802, 0.67914939, -0.19995420],\n", - " [-1.71124649, 2.70574546, -1.33634126, ..., 1.23364413, 0.18697014, -0.57351983],\n", - " ...,\n", - " [-0.96968573, 2.31294894, -0.87524825, ..., 0.85838526, 0.48533469, -0.41773027],\n", - " [-1.36094308, 2.17788029, -1.78127730, ..., 2.09278774, 0.25282228, -0.36496443],\n", - " [-1.69674826, 2.35438418, -1.74168527, ..., 1.36695099, 0.59511113, -0.74147725]],\n", - "\n", - " [[-1.98284078, 2.31777000, -0.90785271, ..., 0.41170627, 0.50061619, 0.08721463],\n", - " [-0.76404583, 1.35577726, -1.36125672, ..., 0.73170459, 0.67842603, 0.16851945],\n", - " [-0.95044655, 1.60376561, -1.30299675, ..., 0.57544005, 0.26769355, 0.33433008],\n", - " ...,\n", - " [-1.47567701, 2.53171301, -1.23207152, ..., 1.29967308, 0.50191855, -0.10343577],\n", - " [-1.17308092, 2.31722355, -1.25421047, ..., 1.73911047, 0.21709818, -0.44447583],\n", - " [-1.26996231, 3.22289634, -0.88719147, ..., 1.64605021, 0.09731755, -0.76786882]],\n", - "\n", - " [[-0.58725590, 1.42905438, -1.39500988, ..., 0.21024795, 0.10272825, 0.09179455],\n", - " [ 0.17428070, 1.78342295, -1.64217877, ..., 0.81127012, 0.31371105, 0.56344515],\n", - " [-0.34916472, 1.83103430, -1.06851172, ..., 0.69243336, 0.13782299, 0.45937473],\n", - " ...,\n", - " [-1.08686376, 2.30020404, -1.26384079, ..., 1.79982817, 0.51338923, -0.52227837],\n", - " [-1.26144814, 2.72396612, -1.37337780, ..., 1.44453299, 0.57420933, -0.33201432],\n", - " [-2.20676827, 4.34621811, -3.82886696, ..., 2.14260173, 1.20336640, -1.37951219]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.39141566, 1.85533464, -0.57471782, ..., 1.00623512, 0.46320182, -1.04523599],\n", - " [-0.86054784, 2.01717925, -1.44368529, ..., 1.45262301, 0.16571884, 0.59231722],\n", - " [-0.73066384, 2.28405023, -1.06989920, ..., 1.58249414, -0.09795550, 0.55030036],\n", + "Tensor(shape=[1, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[[ 0. , 1. , 0. , ..., 1. , 0. , 1. ],\n", + " [ 0.84147102, 0.54030228, 0.80196184, ..., 1. , 0.00010746, 1. ],\n", + " [ 0.90929747, -0.41614681, 0.95814437, ..., 1. , 0.00021492, 1. ],\n", " ...,\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]],\n", - "\n", - " [[-0.16194311, 0.62550521, -1.13234293, ..., 0.07242929, -0.22042468, 0.46362036],\n", - " [-0.08306468, 0.57504302, -1.09298003, ..., 0.91096652, -0.06501988, 0.72986233],\n", - " [-0.28202093, 0.08014385, -0.94177192, ..., 0.33794850, -0.11664233, 0.44514441],\n", - " ...,\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]],\n", - "\n", - " [[-0.54584920, -0.69092435, -1.35965478, ..., -0.78182435, 0.68747747, 0.98427159],\n", - " [ 0.04212743, -1.10618520, -1.43891501, ..., -0.02385022, 0.91146135, 0.52870303],\n", - " [-0.29093450, -0.18858244, -1.54873240, ..., -0.13923697, 0.05795169, 0.30663735],\n", - " ...,\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", - " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]]])\n" + " [-0.76825470, -0.64014435, 0.63279730, ..., 0.99998462, 0.00515809, 0.99998671],\n", + " [-0.95375264, 0.30059254, 0.99899054, ..., 0.99998397, 0.00526555, 0.99998611],\n", + " [-0.26237485, 0.96496606, 0.56074661, ..., 0.99998331, 0.00537301, 0.99998558]]])\n" + ] + } + ], + "source": [ + "print(pos_emb)" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "id": "iraqi-payday", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[ 0.0000000e+00 1.0000000e+00 0.0000000e+00 ... 1.0000000e+00\n", + " 0.0000000e+00 1.0000000e+00]\n", + " [ 8.4147096e-01 5.4030234e-01 8.0196178e-01 ... 1.0000000e+00\n", + " 1.0746076e-04 1.0000000e+00]\n", + " [ 9.0929741e-01 -4.1614684e-01 9.5814437e-01 ... 1.0000000e+00\n", + " 2.1492151e-04 1.0000000e+00]\n", + " ...\n", + " [ 9.5625257e-01 -2.9254240e-01 4.8925215e-01 ... 8.3807874e-01\n", + " 5.1154459e-01 8.5925674e-01]\n", + " [ 2.7049953e-01 -9.6272010e-01 9.9170387e-01 ... 8.3801574e-01\n", + " 5.1163691e-01 8.5920173e-01]\n", + " [-6.6394955e-01 -7.4777740e-01 6.9544029e-01 ... 8.3795273e-01\n", + " 5.1172924e-01 8.5914677e-01]]]\n", + "[1, 5000, 256]\n" + ] + } + ], + "source": [ + "import torch\n", + "import math\n", + "import numpy as np\n", + "\n", + "max_len=5000\n", + "d_model=256\n", + "\n", + "pe = torch.zeros(max_len, d_model)\n", + "position = torch.arange(0, max_len,\n", + " dtype=torch.float32).unsqueeze(1)\n", + "toruch_position = position\n", + "div_term = torch.exp(\n", + " torch.arange(0, d_model, 2, dtype=torch.float32) *\n", + " -(math.log(10000.0) / d_model))\n", + "tourch_div_term = div_term.cpu().detach().numpy()\n", + "\n", + "torhc_sin = torch.sin(position * div_term)\n", + "torhc_cos = torch.cos(position * div_term)\n", + "\n", + "np_sin = np.sin((position * div_term).cpu().detach().numpy())\n", + "np_cos = np.cos((position * div_term).cpu().detach().numpy())\n", + "pe[:, 0::2] = torhc_sin\n", + "pe[:, 1::2] = torhc_cos\n", + "pe = pe.unsqueeze(0) \n", + "tourch_pe = pe.cpu().detach().numpy()\n", + "print(tourch_pe)\n", + "bak_pe = model.encoder.embed.pos_enc.pe\n", + "print(bak_pe.shape)\n", + "model.encoder.embed.pos_enc.pe = paddle.to_tensor(tourch_pe)" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "exempt-cloud", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n" ] } ], @@ -4020,7 +4210,12 @@ "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", "\n", "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n", - "print(xs)" + "#print(xs)\n", + "data = np.load(\".notebook/enc_embed.npz\")\n", + "torch_pos_emb=data['pos_emb']\n", + "torch_xs = data['embed_out']\n", + "print(np.allclose(xs.numpy(), torch_xs))\n", + "print(np.allclose(pos_emb.numpy(), torch_pos_emb))" ] }, { @@ -4029,45 +4224,361 @@ "id": "composite-involvement", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 269, + "id": "handed-harris", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "False\n", + "True\n", + "[256, 2048]\n", + "[2048]\n", + "[2048, 256]\n", + "[256]\n", + "--------ff-------\n", + "True\n", + "False\n", + "False\n", + "False\n", + "False\n", + "True\n", + "linear_714.w_0 True\n", + "linear_714.b_0 True\n", + "linear_715.w_0 True\n", + "linear_715.b_0 True\n", + "False\n", + "True\n" + ] + } + ], + "source": [ + "xs = model.encoder.global_cmvn(feat)\n", + "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", + "\n", + "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n", + "masks = masks.astype(paddle.bool)\n", + "mask_pad = masks.logical_not()\n", + "decoding_chunk_size=0\n", + "num_decoding_left_chunks=-1\n", + "chunk_masks = add_optional_chunk_mask(\n", + " xs, masks, model.encoder.use_dynamic_chunk, model.encoder.use_dynamic_left_chunk,\n", + " decoding_chunk_size, model.encoder.static_chunk_size,\n", + " num_decoding_left_chunks)\n", + "\n", + "#print(chunk_masks)\n", + "data = np.load(\".notebook/enc_embed.npz\")\n", + "torch_pos_emb=data['pos_emb']\n", + "torch_xs = data['embed_out']\n", + "torch_chunk_masks = data['chunk_masks']\n", + "torch_mask_pad = data['mask_pad']\n", + "print(np.allclose(xs.numpy(), torch_xs))\n", + "print(np.allclose(pos_emb.numpy(), torch_pos_emb))\n", + "np.testing.assert_equal(chunk_masks.numpy(), torch_chunk_masks)\n", + "np.testing.assert_equal(mask_pad.numpy(), ~torch_mask_pad)\n", + "\n", + "for layer in model.encoder.encoders:\n", + " #xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n", + " print(layer.feed_forward_macaron is not None)\n", + " print(layer.normalize_before)\n", + " \n", + " data = np.load('.notebook/enc_0_norm_ff.npz')\n", + " t_norm_ff = data['norm_ff']\n", + " t_xs = data['xs']\n", + " \n", + " \n", + " x = xs\n", + " print(np.allclose(t_xs, x.numpy()))\n", + " residual = x\n", + " print(np.allclose(t_xs, residual.numpy()))\n", + " x_nrom = layer.norm_ff_macaron(x)\n", + " print(np.allclose(t.numpy(), x_nrom.numpy()))\n", + " print(np.allclose(t_norm_ff, x_nrom.numpy()))\n", + "# for n, p in layer.norm_ff_macaron.state_dict().items():\n", + "# print(n, p)\n", + "# pass\n", + "\n", + " layer.eval()\n", + " x_nrom = paddle.to_tensor(t_norm_ff)\n", + " print(np.allclose(t_norm_ff, x_nrom.numpy()))\n", + " x = residual + layer.ff_scale * layer.feed_forward_macaron(x_nrom)\n", + " \n", + " ps=[]\n", + " for n, p in layer.feed_forward_macaron.state_dict().items():\n", + " #print(n, p)\n", + " ps.append(p)\n", + " print(p.shape)\n", + " pass\n", + "\n", + " x_nrom = paddle.to_tensor(t_norm_ff)\n", + " ff_l_x = layer.feed_forward_macaron.w_1(x_nrom)\n", + " ff_l_a_x = layer.feed_forward_macaron.activation(ff_l_x)\n", + " ff_l_a_l_x = layer.feed_forward_macaron.w_2(ff_l_a_x)\n", + " data = np.load('.notebook/enc_0_ff_out.npz', allow_pickle=True)\n", + " t_norm_ff = data['norm_ff']\n", + " t_ff_out = data['ff_out']\n", + " t_ff_l_x = data['ff_l_x']\n", + " t_ff_l_a_x = data['ff_l_a_x']\n", + " t_ff_l_a_l_x = data['ff_l_a_l_x']\n", + " t_ps = data['ps']\n", + " \n", + " print(\"--------ff-------\")\n", + " print(np.allclose(x_nrom.numpy(), t_norm_ff))\n", + " print(np.allclose(x.numpy(), t_ff_out))\n", + " print(np.allclose(ff_l_x.numpy(), t_ff_l_x))\n", + " print(np.allclose(ff_l_a_x.numpy(), t_ff_l_a_x))\n", + " print(np.allclose(ff_l_a_l_x.numpy(), t_ff_l_a_l_x))\n", + " \n", + " print(np.allclose(ff_l_x.numpy(), t_ff_l_x, atol=1e-6))\n", + " for p, t_p in zip(ps, t_ps):\n", + " print(p.name, np.allclose(p.numpy(), t_p.T))\n", + " \n", + " \n", + "# residual = x\n", + "# x = layer.norm_mha(x)\n", + "# x_q = x\n", + " \n", + " data = np.load('.notebook/enc_0_selattn_out.npz', allow_pickle=True)\n", + " tx_q = data['x_q']\n", + " tx = data['x']\n", + " tpos_emb=data['pos_emb']\n", + " tmask=data['mask']\n", + " tt_x_att=data['x_att']\n", + " x_q = paddle.to_tensor(tx_q)\n", + " x = paddle.to_tensor(tx)\n", + " pos_emb = paddle.to_tensor(tpos_emb)\n", + " mask = paddle.to_tensor(tmask)\n", + " \n", + " x_att = layer.self_attn(x_q, x, x, pos_emb, mask)\n", + " print(np.allclose(x_att.numpy(), t_x_att))\n", + " print(np.allclose(x_att.numpy(), t_x_att, atol=1e-6))\n", + " \n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 270, + "id": "sonic-thumb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "False\n", + "True\n" + ] + } + ], "source": [ - "\n" + "xs = model.encoder.global_cmvn(feat)\n", + "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", + "\n", + "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n", + "masks = masks.astype(paddle.bool)\n", + "mask_pad = masks.logical_not()\n", + "decoding_chunk_size=0\n", + "num_decoding_left_chunks=-1\n", + "chunk_masks = add_optional_chunk_mask(\n", + " xs, masks, model.encoder.use_dynamic_chunk, model.encoder.use_dynamic_left_chunk,\n", + " decoding_chunk_size, model.encoder.static_chunk_size,\n", + " num_decoding_left_chunks)\n", + "\n", + "#print(chunk_masks)\n", + "data = np.load(\".notebook/enc_embed.npz\")\n", + "torch_pos_emb=data['pos_emb']\n", + "torch_xs = data['embed_out']\n", + "torch_chunk_masks = data['chunk_masks']\n", + "torch_mask_pad = data['mask_pad']\n", + "print(np.allclose(xs.numpy(), torch_xs))\n", + "print(np.allclose(pos_emb.numpy(), torch_pos_emb))\n", + "np.testing.assert_equal(chunk_masks.numpy(), torch_chunk_masks)\n", + "np.testing.assert_equal(mask_pad.numpy(), ~torch_mask_pad)\n", + "\n", + "\n", + "for layer in model.encoder.encoders:\n", + " xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n", + " break\n", + "data = np.load('.notebook/enc_0.npz')\n", + "torch_xs = data['enc_0']\n", + "print(np.allclose(xs.numpy(), torch_xs))\n", + "print(np.allclose(xs.numpy(), torch_xs, atol=1e-6))\n" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "handed-harris", + "execution_count": 273, + "id": "brave-latino", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "--------layers_______\n", + "False\n", + "True\n", + "[[-0.70194244 0.56254214 0.6880346 ... 1.1237319 0.7803924\n", + " 1.1369387 ]\n", + " [-0.7787783 0.3912667 0.71887773 ... 1.251882 0.886168\n", + " 1.3173451 ]\n", + " [-0.95908964 0.6346029 0.87671334 ... 0.98183745 0.7440111\n", + " 1.2903278 ]\n", + " ...\n", + " [-1.0732255 0.67236906 0.92303115 ... 0.9075458 0.8176712\n", + " 1.3239655 ]\n", + " [-1.1654118 0.6819967 0.6939453 ... 1.2238353 0.8028295\n", + " 1.4506507 ]\n", + " [-1.2732092 0.7145806 0.75819594 ... 0.94154835 0.8774845\n", + " 1.2623049 ]]\n", + "xxxxxx\n", + "[[-0.7019424 0.56254166 0.6880345 ... 1.1237322 0.78039217\n", + " 1.1369387 ]\n", + " [-0.778778 0.39126638 0.7188779 ... 1.2518823 0.8861681\n", + " 1.3173454 ]\n", + " [-0.9590891 0.6346026 0.87671363 ... 0.9818373 0.74401116\n", + " 1.2903274 ]\n", + " ...\n", + " [-1.0732253 0.6723689 0.9230311 ... 0.9075457 0.8176713\n", + " 1.3239657 ]\n", + " [-1.165412 0.6819976 0.69394535 ... 1.2238353 0.80282927\n", + " 1.4506509 ]\n", + " [-1.273209 0.71458095 0.75819623 ... 0.9415484 0.8774842\n", + " 1.2623055 ]]\n" + ] + } + ], + "source": [ + "xs = model.encoder.global_cmvn(feat)\n", + "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", + "\n", + "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n", + "masks = masks.astype(paddle.bool)\n", + "mask_pad = masks.logical_not()\n", + "decoding_chunk_size=0\n", + "num_decoding_left_chunks=-1\n", + "chunk_masks = add_optional_chunk_mask(\n", + " xs, masks, model.encoder.use_dynamic_chunk, model.encoder.use_dynamic_left_chunk,\n", + " decoding_chunk_size, model.encoder.static_chunk_size,\n", + " num_decoding_left_chunks)\n", + "\n", + "#print(chunk_masks)\n", + "data = np.load(\".notebook/enc_embed.npz\")\n", + "torch_pos_emb=data['pos_emb']\n", + "torch_xs = data['embed_out']\n", + "torch_chunk_masks = data['chunk_masks']\n", + "torch_mask_pad = data['mask_pad']\n", + "print(np.allclose(xs.numpy(), torch_xs))\n", + "print(np.allclose(pos_emb.numpy(), torch_pos_emb))\n", + "np.testing.assert_equal(chunk_masks.numpy(), torch_chunk_masks)\n", + "np.testing.assert_equal(mask_pad.numpy(), ~torch_mask_pad)\n", + "\n", + "print(\"--------layers_______\")\n", + "i =0\n", + "for layer in model.encoder.encoders:\n", + " xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n", + " i+=1\n", + "# if i == 2:\n", + "# data = np.load('.notebook/enc_2.npz')\n", + "# torch_xs = data['enc_2']\n", + "# print(np.allclose(xs.numpy(), torch_xs))\n", + "# print(np.allclose(xs.numpy(), torch_xs, atol=1e-5))\n", + "# print(xs[0].numpy())\n", + "# print('xxxxxx')\n", + "# print(torch_xs[0])\n", + "# print('----i==2')\n", + "data = np.load('.notebook/enc_all.npz')\n", + "torch_xs = data['enc_all']\n", + "print(np.allclose(xs.numpy(), torch_xs))\n", + "print(np.allclose(xs.numpy(), torch_xs, atol=1e-5))\n", + "print(xs[0].numpy())\n", + "print('xxxxxx')\n", + "print(torch_xs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "municipal-stock", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 278, + "id": "macro-season", "metadata": {}, "outputs": [ { - "ename": "SystemError", - "evalue": "(Fatal) Operator elementwise_sub raises an paddle::memory::allocation::BadAlloc exception.\nThe exception content is\n:ResourceExhaustedError: \n\nOut of memory error on GPU 0. Cannot allocate 1.010986MB memory on GPU 0, available memory is only 6.437500MB.\n\nPlease check whether there is any other process using GPU 0.\n1. If yes, please stop them, or start PaddlePaddle on another GPU.\n2. If no, please decrease the batch size of your model. \n\n (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:69)\n. (at /paddle/paddle/fluid/imperative/tracer.cc:172)\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mSystemError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mencoder_out\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoder_mask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeat_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_out\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_mask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch_encoder_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_built\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 902\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 904\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mforward_post_hook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_post_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/deepspeech/modules/encoder.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, xs, xs_lens, decoding_chunk_size, num_decoding_left_chunks)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_cmvn\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0mxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_cmvn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;31m#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0mxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos_emb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype_as\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_built\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 902\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 904\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mforward_post_hook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_post_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/deepspeech/modules/cmvn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mpaddle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnormalized\u001b[0m \u001b[0mfeature\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \"\"\"\n\u001b[0;32m---> 48\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 49\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm_var\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mistd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py\u001b[0m in \u001b[0;36m__impl__\u001b[0;34m(self, other_var)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0mmath_op\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mops\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmath_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother_var\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'axis'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0mcomment\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOpProtoHolder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_op_proto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSystemError\u001b[0m: (Fatal) Operator elementwise_sub raises an paddle::memory::allocation::BadAlloc exception.\nThe exception content is\n:ResourceExhaustedError: \n\nOut of memory error on GPU 0. Cannot allocate 1.010986MB memory on GPU 0, available memory is only 6.437500MB.\n\nPlease check whether there is any other process using GPU 0.\n1. If yes, please stop them, or start PaddlePaddle on another GPU.\n2. If no, please decrease the batch size of your model. \n\n (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:69)\n. (at /paddle/paddle/fluid/imperative/tracer.cc:172)\n" + "name": "stdout", + "output_type": "stream", + "text": [ + "[[-0.7019424 0.5625421 0.68803453 ... 1.1237317 0.7803923\n", + " 1.1369386 ]\n", + " [-0.7787783 0.39126673 0.71887773 ... 1.251882 0.886168\n", + " 1.3173451 ]\n", + " [-0.95908964 0.6346029 0.87671334 ... 0.98183745 0.7440111\n", + " 1.2903278 ]\n", + " ...\n", + " [-1.0732255 0.67236906 0.92303115 ... 0.9075458 0.8176712\n", + " 1.3239655 ]\n", + " [-1.1654117 0.68199664 0.6939452 ... 1.2238352 0.8028294\n", + " 1.4506506 ]\n", + " [-1.2732091 0.71458054 0.7581958 ... 0.9415482 0.8774844\n", + " 1.2623048 ]]\n", + "---\n", + "[[-0.7019424 0.56254166 0.6880345 ... 1.1237322 0.78039217\n", + " 1.1369387 ]\n", + " [-0.778778 0.39126638 0.7188779 ... 1.2518823 0.8861681\n", + " 1.3173454 ]\n", + " [-0.9590891 0.6346026 0.87671363 ... 0.9818373 0.74401116\n", + " 1.2903274 ]\n", + " ...\n", + " [-1.0732253 0.6723689 0.9230311 ... 0.9075457 0.8176713\n", + " 1.3239657 ]\n", + " [-1.165412 0.6819976 0.69394535 ... 1.2238353 0.80282927\n", + " 1.4506509 ]\n", + " [-1.2732087 0.71458083 0.7581961 ... 0.9415482 0.877484\n", + " 1.2623053 ]]\n", + "False\n", + "True\n", + "False\n" ] } ], "source": [ - "encoder_out, encoder_mask = model.encoder(feat, feat_len)\n", - "print(encoder_out.shape)\n", - "print(encoder_mask.shape)\n", - "print(encoder_out[0])\n", - "print(torch_encoder_out[0])" + "encoder_out, mask = model.encoder(feat, feat_len)\n", + "print(encoder_out.numpy()[0])\n", + "print(\"---\")\n", + "print(torch_encoder_out[0])\n", + "print(np.allclose(torch_encoder_out, encoder_out.numpy()))\n", + "print(np.allclose(torch_encoder_out, encoder_out.numpy(), atol=1e-5))\n", + "print(np.allclose(torch_encoder_out, encoder_out.numpy(), atol=1e-6))" ] }, { "cell_type": "code", "execution_count": null, - "id": "sonic-thumb", + "id": "associate-sampling", "metadata": {}, "outputs": [], "source": []