deepspeech2.py 17.0 KB
Newer Older
H
huangyuxin 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
H
huangyuxin 已提交
14
"""Deepspeech2 ASR Online Model"""
H
huangyuxin 已提交
15 16 17
from typing import Optional

import paddle
H
huangyuxin 已提交
18
import paddle.nn.functional as F
H
huangyuxin 已提交
19
from paddle import nn
H
huangyuxin 已提交
20 21
from yacs.config import CfgNode

H
huangyuxin 已提交
22
from deepspeech.models.ds2_online.conv import Conv2dSubsampling4Online
H
huangyuxin 已提交
23
from deepspeech.modules.ctc import CTCDecoder
H
huangyuxin 已提交
24 25 26 27 28
from deepspeech.utils import layer_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
# Module-level logger, created from this module's name.
logger = Log(__name__).getlog()

# Names exported by `from ... import *`.
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
H
huangyuxin 已提交
30 31 32 33 34 35 36


class CRNNEncoder(nn.Layer):
    """Streaming encoder: conv subsampling -> (RNN + LayerNorm) * N -> (FC + ReLU) * M.

    The RNN states are exposed as explicit inputs/outputs of ``forward`` so
    that an utterance can be processed chunk by chunk, feeding the final
    states of one chunk in as the initial states of the next.
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=None,
                 use_gru=False):
        """Build the encoder layers.

        Args:
            feat_size (int): input feature dimension (e.g. 161 for linear).
            dict_size (int): vocabulary size; only stored on the instance.
            num_conv_layers (int): accepted for interface compatibility; the
                conv front end built here is always ``Conv2dSubsampling4Online``.
            num_rnn_layers (int): number of stacked RNN layers.
            rnn_size (int): hidden size of every RNN layer.
            rnn_direction (str): 'forward', or 'bidirect' / 'bidirectional'.
            num_fc_layers (int): number of FC layers applied after the RNNs.
            fc_layers_size_list (list of int, optional): output size of each
                FC layer; needs at least ``num_fc_layers`` entries.
                Defaults to ``[512, 256]``.
            use_gru (bool): build GRU layers if exactly ``True``, else LSTM.

        Raises:
            ValueError: if ``rnn_direction`` is not supported, or if
                ``fc_layers_size_list`` has fewer than ``num_fc_layers`` items.
        """
        super().__init__()
        # A `None` sentinel plus a copy keeps instances from sharing (and
        # mutating) one default list or the caller's list.
        if fc_layers_size_list is None:
            fc_layers_size_list = [512, 256]
        else:
            fc_layers_size_list = list(fc_layers_size_list)
        if num_fc_layers > len(fc_layers_size_list):
            raise ValueError(
                f"num_fc_layers={num_fc_layers} needs at least that many "
                f"entries in fc_layers_size_list, got {fc_layers_size_list}")

        # A bidirectional RNN concatenates both directions on the feature axis.
        if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
            layernorm_size = 2 * rnn_size
        elif rnn_direction == 'forward':
            layernorm_size = rnn_size
        else:
            raise ValueError("Wrong rnn direction")

        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.rnn_direction = rnn_direction
        self.fc_layers_size_list = fc_layers_size_list
        self.use_gru = use_gru

        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
        # `output_dim` tracks the feature size produced by the last layer
        # built so far; it is refined below as RNN and FC layers are added.
        self.output_dim = self.conv.output_dim

        self.rnn = nn.LayerList()
        self.layernorm_list = nn.LayerList()
        self.fc_layers_list = nn.LayerList()

        rnn_cls = nn.GRU if use_gru is True else nn.LSTM
        for i in range(num_rnn_layers):
            # The first RNN consumes the conv output; later ones consume the
            # previous layer's normalised output.
            rnn_input_size = self.conv.output_dim if i == 0 else layernorm_size
            self.rnn.append(
                rnn_cls(
                    input_size=rnn_input_size,
                    hidden_size=rnn_size,
                    num_layers=1,
                    direction=rnn_direction))
            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            self.output_dim = layernorm_size

        fc_input_size = layernorm_size
        for i in range(num_fc_layers):
            fc_output_size = fc_layers_size_list[i]
            self.fc_layers_list.append(nn.Linear(fc_input_size, fc_output_size))
            fc_input_size = fc_output_size
            self.output_dim = fc_output_size

    @property
    def output_size(self):
        """Feature dimension of the tensor returned by ``forward``."""
        return self.output_dim

    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
        """Compute Encoder outputs

        Args:
            x (Tensor): input features, [B, T, D]
            x_lens (Tensor): valid length of every sequence, [B]
            init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
                Only split when LSTM layers are used; with GRU it is
                returned unchanged.
        Return:
            x (Tensor): encoder outputs; the last dimension is `output_size`
            x_lens (Tensor): encoder length, [B]
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        # NOTE: this method is traced by `paddle.jit.to_static` through the
        # inference model, so the control flow is kept deliberately plain.
        if init_state_h_box is not None:
            init_state_list = None

            # The boxes stack per-layer states along axis 0; split them back
            # into one state per RNN layer.
            if self.use_gru is True:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_list = init_state_h_list
            else:
                init_state_h_list = paddle.split(
                    init_state_h_box, self.num_rnn_layers, axis=0)
                init_state_c_list = paddle.split(
                    init_state_c_box, self.num_rnn_layers, axis=0)
                # LSTM layers take their state as an (h, c) pair.
                init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                   for i in range(self.num_rnn_layers)]
        else:
            # No incoming state: every RNN layer uses its default initial state.
            init_state_list = [None] * self.num_rnn_layers

        x, x_lens = self.conv(x, x_lens)
        final_chunk_state_list = []
        for i in range(0, self.num_rnn_layers):
            x, final_state = self.rnn[i](x, init_state_list[i],
                                         x_lens)  #[B, T, D]
            final_chunk_state_list.append(final_state)
            x = self.layernorm_list[i](x)

        for i in range(self.num_fc_layers):
            x = self.fc_layers_list[i](x)
            x = F.relu(x)

        if self.use_gru is True:
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_list, axis=0)
            # GRU has no cell state: hand the incoming c box back untouched
            # so the return signature is the same for both RNN types.
            final_chunk_state_c_box = init_state_c_box
        else:
            final_chunk_state_h_list = [
                final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_c_list = [
                final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
            ]
            final_chunk_state_h_box = paddle.concat(
                final_chunk_state_h_list, axis=0)
            final_chunk_state_c_box = paddle.concat(
                final_chunk_state_c_list, axis=0)

        return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box

    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
        """Compute Encoder outputs chunk by chunk, carrying the RNN states along

        Args:
            x (Tensor): [B, T, D]
            x_lens (Tensor): [B]
            decoder_chunk_size: The chunk size of decoder
        Returns:
            eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
            eouts_lens_list (List of Tensor): The list of  encoder length in chunk_size: [B] * num_chunks
            final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
            final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
        """
        subsampling_rate = self.conv.subsampling_rate
        receptive_field_length = self.conv.receptive_field_length
        # Input frames per chunk, derived from the requested number of
        # decoder frames and the conv front end's stride / receptive field.
        chunk_size = (decoder_chunk_size - 1
                      ) * subsampling_rate + receptive_field_length
        chunk_stride = subsampling_rate * decoder_chunk_size
        max_len = x.shape[1]
        assert chunk_size <= max_len, (
            f"input has {max_len} frames, fewer than one chunk "
            f"({chunk_size} frames)")

        # Zero-pad the time axis so the input splits into whole chunks.
        # The padding takes x's dtype so the concat cannot fail on a
        # dtype mismatch.
        padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
        padding = paddle.zeros(
            (x.shape[0], padding_len, x.shape[2]), dtype=x.dtype)
        padded_x = paddle.concat([x, padding], axis=1)
        # Exact by construction of `padding_len`, so integer division is safe.
        num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1

        eouts_chunk_list = []
        eouts_chunk_lens_list = []
        chunk_state_h_box = None
        chunk_state_c_box = None
        # Loop-invariant helpers for clamping the per-chunk lengths.
        zero_lens = paddle.zeros_like(x_lens)
        full_chunk_lens = paddle.ones_like(x_lens) * chunk_size
        for i in range(num_chunk):
            start = i * chunk_stride
            end = start + chunk_size
            x_chunk = padded_x[:, start:end, :]

            # Valid frames of each sequence inside this chunk: what remains
            # after `start`, clamped to the range [0, chunk_size].
            remaining = x_lens - start
            x_len_left = paddle.where(remaining < 0, zero_lens, remaining)
            x_chunk_lens = paddle.where(x_len_left < full_chunk_lens,
                                        x_len_left, full_chunk_lens)

            # The final states of this chunk seed the next chunk.
            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)

            eouts_chunk_list.append(eouts_chunk)
            eouts_chunk_lens_list.append(eouts_chunk_lens)

        return eouts_chunk_list, eouts_chunk_lens_list, chunk_state_h_box, chunk_state_c_box
H
huangyuxin 已提交
213 214


H
huangyuxin 已提交
215 216
class DeepSpeech2ModelOnline(nn.Layer):
    """The DeepSpeech2 network structure for online.

    A ``CRNNEncoder`` followed by a ``CTCDecoder``.

    :param feat_size: feature size for audio.
    :type feat_size: int
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param rnn_direction: 'forward', or 'bidirect' / 'bidirectional'.
    :type rnn_direction: str
    :param num_fc_layers: Number of stacking FC layers.
    :type num_fc_layers: int
    :param fc_layers_size_list: The list of FC layer sizes.
                                Defaults to [512, 256].
    :type fc_layers_size_list: [int,]
    :param use_gru: Use GRU if set True. Use LSTM if set False.
    :type use_gru: bool
    """

    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        """Return the default model config, merging it into `config` if given.

        Note that the `use_gru` default here is True, while the constructor
        default is False.
        """
        default = CfgNode(
            dict(
                num_conv_layers=2,  #Number of stacking convolution layers.
                num_rnn_layers=4,  #Number of stacking RNN layers.
                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                # Read by `from_pretrained` / `from_config`, so it needs a
                # default like every other key.
                rnn_direction='forward',
                num_fc_layers=2,
                fc_layers_size_list=[512, 256],
                use_gru=True,  #Use GRU if set True. Use LSTM if set False.
            ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=None,
                 use_gru=False):
        super().__init__()
        # `None` sentinel instead of a shared mutable default list.
        if fc_layers_size_list is None:
            fc_layers_size_list = [512, 256]
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            rnn_size=rnn_size,
            use_gru=use_gru)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in  vocab
            enc_n_units=self.encoder.output_size,
            blank_id=0,  # first token is <blank>
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True)  # sum / batch_size

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]

        Returns:
            loss (Tensor): [1]
        """
        # Whole-utterance pass: no initial RNN state is supplied, and the
        # final states are not needed.
        eouts, eouts_len, _, _ = self.encoder(audio, audio_len, None, None)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len, vocab_list, decoding_method,
               lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
               cutoff_top_n, num_processes):
        """Decode a batch of audio with the CTC decoder.

        Runs the encoder on the whole input without initial RNN state,
        applies the decoder's softmax and passes the probabilities, together
        with all decoding options, to ``CTCDecoder.decode_probs``, whose
        result is returned as is.

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            vocab_list, decoding_method, lang_model_path, beam_alpha,
            beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes:
                forwarded unchanged to the decoder.
        """
        # init once
        # decoders only accept string encoded in utf-8
        self.decoder.init_decode(
            beam_alpha=beam_alpha,
            beam_beta=beam_beta,
            lang_model_path=lang_model_path,
            vocab_list=vocab_list,
            decoding_method=decoding_method)

        eouts, eouts_len, _, _ = self.encoder(audio, audio_len, None, None)
        probs = self.decoder.softmax(eouts)
        return self.decoder.decode_probs(
            probs.numpy(), eouts_len, vocab_list, decoding_method,
            lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
            cutoff_top_n, num_processes)

    @classmethod
    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2ModelOnline model from a pretrained model.
        Parameters
        ----------
        dataloader: paddle.io.DataLoader
            its `collate_fn` supplies `feature_size` and `vocab_size`

        config: yacs.config.CfgNode
            model configs

        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name

        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from pretrained result.
        """
        model = cls(feat_size=dataloader.collate_fn.feature_size,
                    dict_size=dataloader.collate_fn.vocab_size,
                    num_conv_layers=config.model.num_conv_layers,
                    num_rnn_layers=config.model.num_rnn_layers,
                    rnn_size=config.model.rnn_layer_size,
                    rnn_direction=config.model.rnn_direction,
                    num_fc_layers=config.model.num_fc_layers,
                    fc_layers_size_list=config.model.fc_layers_size_list,
                    use_gru=config.model.use_gru)
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

    @classmethod
    def from_config(cls, config):
        """Build a DeepSpeech2ModelOnline from config
        Parameters
        ----------
        config: yacs.config.CfgNode
            config.model
        Returns
        -------
        DeepSpeech2ModelOnline
            The model built from config.
        """
        model = cls(feat_size=config.feat_size,
                    dict_size=config.dict_size,
                    num_conv_layers=config.num_conv_layers,
                    num_rnn_layers=config.num_rnn_layers,
                    rnn_size=config.rnn_layer_size,
                    rnn_direction=config.rnn_direction,
                    num_fc_layers=config.num_fc_layers,
                    fc_layers_size_list=config.fc_layers_size_list,
                    use_gru=config.use_gru)
        return model

H
huangyuxin 已提交
384

H
huangyuxin 已提交
385
class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
    """Inference variant of ``DeepSpeech2ModelOnline``.

    ``forward`` processes one audio chunk and returns softmax probabilities
    together with the RNN states to feed into the next chunk; ``export``
    converts the model with ``paddle.jit.to_static``.
    """

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=4,
                 rnn_size=1024,
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=None,
                 use_gru=False):
        # `None` sentinel instead of a shared mutable default list; resolved
        # here so the parent always receives a concrete list.
        if fc_layers_size_list is None:
            fc_layers_size_list = [512, 256]
        super().__init__(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
            use_gru=use_gru)

    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
        """Run the encoder and softmax on one audio chunk.

        Args:
            audio_chunk (Tensor): [B, chunk_size, feat_dim]
            audio_chunk_lens (Tensor): [B]
            chunk_state_h_box (Tensor): initial h states of the RNN layers.
            chunk_state_c_box (Tensor): initial c states of the RNN layers.

        Returns:
            probs_chunk (Tensor): softmax of the encoder outputs.
            eouts_chunk_lens (Tensor): encoder output lengths, [B]
            final_state_h_box (Tensor): final h states, to be passed as
                `chunk_state_h_box` for the next chunk.
            final_state_c_box (Tensor): final c states, to be passed as
                `chunk_state_c_box` for the next chunk.
        """
        eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
            audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
        probs_chunk = self.decoder.softmax(eouts_chunk)
        return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box

    def export(self):
        """Convert this model with ``paddle.jit.to_static`` and return it.

        The input specs mirror the arguments of ``forward``: the audio
        chunk, its lengths, and the h / c state boxes.
        """
        static_model = paddle.jit.to_static(
            self,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, None,
                           self.encoder.feat_size],  #[B, chunk_size, feat_dim]
                    dtype='float32'),
                paddle.static.InputSpec(shape=[None],
                                        dtype='int64'),  # audio_length, [B]
                # chunk_state_h_box
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32'),
                # chunk_state_c_box
                paddle.static.InputSpec(
                    shape=[None, None, None], dtype='float32')
            ])
        return static_model