Commit 16fa4245 authored by Hui Zhang

add test; attention

Parent bc6da7a1
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "breeding-haven",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "appropriate-theta",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "entire-bloom",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
]
}
],
"source": [
"from deepspeech.modules import loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "governmental-aircraft",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"import paddle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "proprietary-disaster",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.repeat"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "first-diagram",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<property at 0x7fb515eeeb88>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.size"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "intelligent-david",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.cat"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bronze-tenant",
"metadata": {},
"outputs": [],
"source": [
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "balanced-bearing",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "extreme-republic",
"metadata": {},
"outputs": [],
"source": [
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
" nargs = len(args)\n",
" assert (nargs <= 1)\n",
" s = paddle.shape(xs)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"\n",
"# logger.warn(\n",
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
"# )\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "gross-addiction",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size(0)\n",
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "adverse-dining",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "popular-potato",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
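The notebook above pokes at the stock Paddle tensor API: `paddle.Tensor.size` is a built-in property holding the element count (`7` for the sample tensor), while the torch-style `size()` call the DeepSpeech code expects has to be patched in. A minimal standalone sketch of the same experiment, assuming a plain Paddle 2.x install with no deepspeech patches loaded yet:

import paddle

a = paddle.to_tensor([12, 32, 10, 12, 123, 32, 4])
print(a.size)  # built-in property: total number of elements -> 7


def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
    """torch-style size(): full shape, or the extent of a single axis."""
    assert len(args) <= 1
    s = paddle.shape(xs)
    return s[args[0]] if args else s


paddle.Tensor.size = size  # shadow the property, as the notebook does
print(a.size())   # Tensor(shape=[1], ...) holding [7]
print(a.size(0))  # extent of axis 0, also 7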
This diff has been collapsed.
@@ -43,20 +43,53 @@ if not hasattr(paddle.Tensor, 'cat'):
     paddle.Tensor.cat = paddle.Tensor.concat
 
 
+def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
+    return xs.equal(paddle.to_tensor(ys, dtype=xs.dtype, place=xs.place))
+
+
+if not hasattr(paddle.Tensor, 'eq'):
+    logger.warn(
+        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    paddle.Tensor.eq = eq
+
+
+def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
+    return xs
+
+
+if not hasattr(paddle.Tensor, 'contiguous'):
+    logger.warn(
+        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    paddle.Tensor.contiguous = contiguous
+
+
 def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
     nargs = len(args)
     assert (nargs <= 1)
     s = paddle.shape(xs)
     if nargs == 1:
-        return s[args]
+        return s[args[0]]
     else:
         return s
 
 
-# logger.warn(
-#     "override size of paddle.Tensor if exists or register, remove this when fixed!"
-# )
-# paddle.Tensor.size = size
+#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
+logger.warn(
+    "override size of paddle.Tensor "
+    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
+)
+paddle.Tensor.size = size
+
+
+def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
+    return xs.reshape(args)
+
+
+if not hasattr(paddle.Tensor, 'view'):
+    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.view = view
 
 
 def masked_fill(xs: paddle.Tensor,
@@ -185,6 +218,14 @@ if not hasattr(paddle.nn, 'ConstantPad2d'):
         "register user ConstantPad2d to paddle.nn, remove this when fixed!")
     setattr(paddle.nn, 'ConstantPad2d', ConstantPad2d)
 
+
+if not hasattr(paddle, 'softmax'):
+    logger.warn("register user softmax to paddle, remove this when fixed!")
+    setattr(paddle, 'softmax', paddle.nn.functional.softmax)
+
+
+if not hasattr(paddle, 'sigmoid'):
+    logger.warn("register user sigmoid to paddle, remove this when fixed!")
+    setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
+
 
 # hack loss
 def ctc_loss(logits,
 ...
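Taken together, this hunk backfills the torch-flavoured surface the ported WeNet code relies on: `eq`, `contiguous`, a `size()` method, `view`, plus module-level `paddle.softmax`/`paddle.sigmoid`. A hedged sketch of what that buys once the patches have run; importing anything under `deepspeech` triggers them, as the registration warnings in the notebook above show:

import paddle
from deepspeech.modules import loss  # noqa: F401 -- imported for its side effect: the tensor patches

x = paddle.to_tensor([[1., 2.], [3., 4.]])

x.contiguous()              # no-op; Paddle tensors are already dense
x.eq(3.0)                   # elementwise comparison against a scalar
x.size()                    # torch-style: Tensor holding the shape [2, 2]
x.size(1)                   # extent of axis 1
x.view(4, 1)                # routed to reshape((4, 1))
paddle.softmax(x, axis=-1)  # forwarded to paddle.nn.functional.softmax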
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-Head Attention layer definition."""
import math
import logging
from typing import Optional, Tuple
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
logger = logging.getLogger(__name__)
__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"]
class MultiHeadedAttention(nn.Layer):
"""Multi-Head Attention layer."""
def __init__(self, n_head: int, n_feat: int, dropout_rate: float):
"""Construct an MultiHeadedAttention object.
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
dropout_rate (float): Dropout rate.
"""
super().__init__()
assert n_feat % n_head == 0
# We assume d_v always equals d_k
self.d_k = n_feat // n_head
self.h = n_head
self.linear_q = nn.Linear(n_feat, n_feat)
self.linear_k = nn.Linear(n_feat, n_feat)
self.linear_v = nn.Linear(n_feat, n_feat)
self.linear_out = nn.Linear(n_feat, n_feat)
self.dropout = nn.Dropout(p=dropout_rate)
def forward_qkv(self,
query: paddle.Tensor,
key: paddle.Tensor,
value: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Transform query, key and value.
Args:
query (paddle.Tensor): Query tensor (#batch, time1, size).
key (paddle.Tensor): Key tensor (#batch, time2, size).
value (paddle.Tensor): Value tensor (#batch, time2, size).
Returns:
paddle.Tensor: Transformed query tensor, size
(#batch, n_head, time1, d_k).
paddle.Tensor: Transformed key tensor, size
(#batch, n_head, time2, d_k).
paddle.Tensor: Transformed value tensor, size
(#batch, n_head, time2, d_k).
"""
n_batch = query.size(0)
q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)
k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k)
v = v.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k)
return q, k, v
def forward_attention(self,
value: paddle.Tensor,
scores: paddle.Tensor,
mask: Optional[paddle.Tensor]) -> paddle.Tensor:
"""Compute attention context vector.
Args:
value (paddle.Tensor): Transformed value, size
(#batch, n_head, time2, d_k).
scores (paddle.Tensor): Attention score, size
(#batch, n_head, time1, time2).
mask (paddle.Tensor): Mask, size (#batch, 1, time2) or
(#batch, time1, time2).
Returns:
paddle.Tensor: Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
"""
n_batch = value.size(0)
if mask is not None:
mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
scores = scores.masked_fill(mask, -float('inf'))
attn = paddle.softmax(
scores, axis=-1).masked_fill(mask,
0.0) # (batch, head, time1, time2)
else:
attn = paddle.softmax(
scores, axis=-1) # (batch, head, time1, time2)
p_attn = self.dropout(attn)
x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k)
x = x.transpose([0, 2, 1, 3]).contiguous().view(
n_batch, -1, self.h * self.d_k) # (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model)
def forward(self,
query: paddle.Tensor,
key: paddle.Tensor,
value: paddle.Tensor,
mask: Optional[paddle.Tensor]) -> paddle.Tensor:
"""Compute scaled dot product attention.
Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
            mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
scores = paddle.matmul(q, k.transpose(
[0, 1, 3, 2])) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding."""
def __init__(self, n_head, n_feat, dropout_rate):
"""Construct an RelPositionMultiHeadedAttention object.
Paper: https://arxiv.org/abs/1901.02860
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
dropout_rate (float): Dropout rate.
"""
super().__init__(n_head, n_feat, dropout_rate)
# linear transformation for positional encoding
self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # paddle has no nn.Parameter/xavier_uniform_; create the biases as
        # layer parameters with a Xavier-uniform initializer instead.
        self.pos_bias_u = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
        self.pos_bias_v = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
    def rel_shift(self, x, zero_triu: bool=False):
        """Compute relative positional encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, time, size).
            zero_triu (bool): If true, return the lower triangular part of
                the matrix.
        Returns:
            paddle.Tensor: Output tensor.
        """
        # pad one zero column on the last axis, then reshape so every row is
        # shifted by one relative position
        zero_pad = paddle.zeros(
            (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
        x_padded = paddle.concat([zero_pad, x], axis=-1)
        x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
                                 x.shape[2])
        x = x_padded[:, :, 1:].reshape(x.shape)
        if zero_triu:
            ones = paddle.ones((x.shape[2], x.shape[3]))
            x = x * paddle.tril(ones,
                                x.shape[3] - x.shape[2]).unsqueeze([0, 1])
        return x
def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
                pos_emb: paddle.Tensor,
                mask: Optional[paddle.Tensor]) -> paddle.Tensor:
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
            pos_emb (paddle.Tensor): Positional embedding tensor
                (#batch, time2, size).
            mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)
        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch, head, time1, time2)
        matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
# Remove rel_shift since it is useless in speech recognition,
# and it requires special attention for streaming.
# matrix_bd = self.rel_shift(matrix_bd)
scores = (matrix_ac + matrix_bd) / math.sqrt(
self.d_k) # (batch, head, time1, time2)
return self.forward_attention(v, scores, mask)
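Because the layer leans on the patched `size`, `view`, `contiguous`, `masked_fill`, and `paddle.softmax` from the diff above, a quick smoke test only needs to import it through the `deepspeech` package (which applies those patches on import). A hedged sketch; the module path `deepspeech.modules.attention` and the random inputs are assumptions for illustration:

import paddle
from deepspeech.modules.attention import MultiHeadedAttention  # path assumed from the repo layout

batch, time, n_feat, n_head = 2, 5, 256, 4
mha = MultiHeadedAttention(n_head=n_head, n_feat=n_feat, dropout_rate=0.1)

q = paddle.randn([batch, time, n_feat])
k = paddle.randn([batch, time, n_feat])
v = paddle.randn([batch, time, n_feat])
mask = paddle.ones([batch, 1, time], dtype='int64')  # 1 = keep, 0 = padding

out = mha(q, k, v, mask)
print(out.shape)  # [2, 5, 256] -> (#batch, time1, d_model)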
@@ -5,3 +5,4 @@ python_speech_features
 tensorboardX
 yacs
 typeguard
+pre-commit