diff --git a/.notebook/position_embeding_check.ipynb b/.notebook/position_embeding_check.ipynb
index f7410c256b69dabfdaaebb2ff6fa8e45aa83ec31..d4b9098d989c40f5ceb3e36842354336c8d280dc 100644
--- a/.notebook/position_embeding_check.ipynb
+++ b/.notebook/position_embeding_check.ipynb
@@ -2,10 +2,18 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
    "id": "designing-borough",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      " and should_run_async(code)\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -61,7 +69,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 5,
    "id": "swiss-referral",
    "metadata": {},
    "outputs": [
@@ -74,13 +82,69 @@
       "False\n",
       "False\n",
       "False\n",
-      "False\n"
+      "False\n",
+      "[[ 1. 1. 1. ... 1. 1.\n",
+      " 1. ]\n",
+      " [ 0.5403023 0.59737533 0.6479059 ... 1. 1.\n",
+      " 1. ]\n",
+      " [-0.41614684 -0.28628543 -0.1604359 ... 0.99999994 1.\n",
+      " 1. ]\n",
+      " ...\n",
+      " [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.99993724\n",
+      " 0.9999457 ]\n",
+      " [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
+      " 0.99994457]\n",
+      " [ 0.03982088 -0.52298605 -0.6157435 ... 0.99992454 0.9999347\n",
+      " 0.99994344]]\n",
+      "----\n",
+      "[[ 1. 1. 1. ... 1. 1.\n",
+      " 1. ]\n",
+      " [ 0.54030234 0.59737533 0.6479059 ... 1. 1.\n",
+      " 1. ]\n",
+      " [-0.41614684 -0.28628543 -0.1604359 ... 1. 1.\n",
+      " 1. ]\n",
+      " ...\n",
+      " [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.9999373\n",
+      " 0.9999457 ]\n",
+      " [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
+      " 0.99994457]\n",
+      " [ 0.03982088 -0.5229861 -0.6157435 ... 0.99992454 0.9999347\n",
+      " 0.99994344]]\n",
+      ")))))))\n",
+      "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
+      " 0.0000000e+00 0.0000000e+00]\n",
+      " [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
+      " 1.1547816e-04 1.0746076e-04]\n",
+      " [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
+      " 2.3095631e-04 2.1492151e-04]\n",
+      " ...\n",
+      " [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
+      " 1.1201146e-02 1.0423505e-02]\n",
+      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
+      " 1.1316618e-02 1.0530960e-02]\n",
+      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
+      " 1.1432089e-02 1.0638415e-02]]\n",
+      "----\n",
+      "[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
+      " 0.0000000e+00 0.0000000e+00]\n",
+      " [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
+      " 1.1547816e-04 1.0746076e-04]\n",
+      " [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
+      " 2.3095631e-04 2.1492151e-04]\n",
+      " ...\n",
+      " [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
+      " 1.1201146e-02 1.0423505e-02]\n",
+      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
+      " 1.1316618e-02 1.0530960e-02]\n",
+      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
+      " 1.1432089e-02 1.0638415e-02]]\n"
      ]
     }
    ],
    "source": [
     "import paddle\n",
-    "ppe = paddle.zeros([max_len, d_model])\n",
+    "paddle.set_device('cpu')\n",
+    "ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
     "position = paddle.arange(0, max_len,\n",
     " dtype='float32').unsqueeze(1)\n",
     "print(np.allclose(position.numpy(), toruch_position))\n",
@@ -98,12 +162,19 @@
     "ppe[:, 0::2] = p_sin\n",
     "ppe[:, 1::2] = p_cos\n",
     "print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
-    "print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))"
+    "print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
+    "print(p_cos.numpy())\n",
+    "print(\"----\")\n",
+    "print(torhc_cos.cpu().detach().numpy())\n",
+    "print(\")))))))\")\n",
+    "print(p_sin.numpy())\n",
+    "print(\"----\")\n",
+    "print(torhc_sin.cpu().detach().numpy())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 4,
    "id": "integrated-boards",
    "metadata": {},
    "outputs": [
diff --git a/deepspeech/utils/layer_tools.py b/deepspeech/utils/layer_tools.py
index c05982c14dfce73ba290d0c75c62f4ccb128d558..f1fd50aa197ff5fdcde34e1d83f2985f8e87a5f0 100644
--- a/deepspeech/utils/layer_tools.py
+++ b/deepspeech/utils/layer_tools.py
@@ -31,7 +31,8 @@ def summary(layer: nn.Layer, print_func=print):
         num_elements += np.prod(param.shape)
         num_params += 1
     if print_func:
-        print_func(f"Total parameters: {num_params}, {num_elements} elements.")
+        num_elements = num_elements / 1024**3
+        print_func(f"Total parameters: {num_params}, {num_elements}G elements.")


 def print_grads(model, print_func=print):
@@ -54,7 +55,8 @@ def print_params(model, print_func=print):
         if print_func:
             print_func(msg)
     if print_func:
-        print_func(f"Total parameters: {num_params}, {total} elements.")
+        total = total / 1024**3
+        print_func(f"Total parameters: {num_params}, {total}G elements.")


 def gradient_norm(layer: nn.Layer):
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index 7c7d866fbb47126c2b0ad0904d8849226dc9c365..55680e9eb5ecd7e13e987c191a4595db3c58254b 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -84,7 +84,7 @@ training:
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0
-  log_interval: 1
+  log_interval: 100

 decoding:
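Note (not part of the patch): the notebook hunks above compare a Paddle-built sinusoidal positional encoding against a PyTorch reference (`toruch_position`, `torhc_sin`, `torhc_cos`, defined in cells outside this diff). The following is a minimal standalone sketch of that check, with a NumPy reference standing in for the PyTorch tensors; `max_len` and `d_model` are assumed example sizes, not values taken from the notebook.

```python
import math

import numpy as np
import paddle

max_len, d_model = 5000, 256  # assumed example sizes; the notebook defines its own
paddle.set_device('cpu')

# NumPy reference for the sinusoidal encoding:
# PE[pos, 2i] = sin(pos / 10000^(2i/d_model)), PE[pos, 2i+1] = cos(...)
np_position = np.arange(0, max_len, dtype='float32')[:, None]
np_div_term = np.exp(
    np.arange(0, d_model, 2, dtype='float32') * -(math.log(10000.0) / d_model))
np_sin = np.sin(np_position * np_div_term)
np_cos = np.cos(np_position * np_div_term)

# Paddle version, mirroring the patched notebook cell.
ppe = paddle.zeros((max_len, d_model), dtype='float32')
position = paddle.arange(0, max_len, dtype='float32').unsqueeze(1)
div_term = paddle.exp(
    paddle.arange(0, d_model, 2, dtype='float32') * -(math.log(10000.0) / d_model))
p_sin = paddle.sin(position * div_term)
p_cos = paddle.cos(position * div_term)
ppe[:, 0::2] = p_sin
ppe[:, 1::2] = p_cos

# Bit-exact equality is not expected in float32 (the dumps in the diff differ in
# the 7th decimal), so compare with a tolerance rather than `==`.
print(np.allclose(p_sin.numpy(), np_sin, rtol=1e-5, atol=1e-6))
print(np.allclose(p_cos.numpy(), np_cos, rtol=1e-5, atol=1e-6))
```

The tolerance-based comparison is the relevant design point: the matrices printed in the notebook output agree to roughly single-precision accuracy, so `np.allclose` with a small tolerance is the appropriate equivalence check between the two frameworks.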