From 521e222db8eab16754d4e7f9985924a317add2fd Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Fri, 11 Mar 2022 17:35:51 +0800
Subject: [PATCH] Add mdtc model.

---
 paddlespeech/kws/__init__.py        |  13 ++
 paddlespeech/kws/models/__init__.py |  13 ++
 paddlespeech/kws/models/mdtc.py     | 218 ++++++++++++++++++++++++++++
 3 files changed, 244 insertions(+)
 create mode 100644 paddlespeech/kws/__init__.py
 create mode 100644 paddlespeech/kws/models/__init__.py
 create mode 100644 paddlespeech/kws/models/mdtc.py

diff --git a/paddlespeech/kws/__init__.py b/paddlespeech/kws/__init__.py
new file mode 100644
index 00000000..97043fd7
--- /dev/null
+++ b/paddlespeech/kws/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/kws/models/__init__.py b/paddlespeech/kws/models/__init__.py
new file mode 100644
index 00000000..97043fd7
--- /dev/null
+++ b/paddlespeech/kws/models/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py
new file mode 100644
index 00000000..25b79baf
--- /dev/null
+++ b/paddlespeech/kws/models/mdtc.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class DSDilatedConv1d(nn.Layer):
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            kernel_size: int,
+            dilation: int=1,
+            stride: int=1,
+            bias: bool=True, ):
+        super(DSDilatedConv1d, self).__init__()
+        self.receptive_fields = dilation * (kernel_size - 1)
+        self.conv = nn.Conv1D(
+            in_channels,
+            in_channels,
+            kernel_size,
+            padding=0,
+            dilation=dilation,
+            stride=stride,
+            groups=in_channels,
+            bias_attr=bias, )
+        self.bn = nn.BatchNorm1D(in_channels)
+        self.pointwise = nn.Conv1D(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            padding=0,
+            dilation=1,
+            bias_attr=bias)
+
+    def forward(self, inputs: paddle.Tensor):
+        outputs = self.conv(inputs)
+        outputs = self.bn(outputs)
+        outputs = self.pointwise(outputs)
+        return outputs
+
+
+class TCNBlock(nn.Layer):
+    def __init__(
+            self,
+            in_channels: int,
+            res_channels: int,
+            kernel_size: int,
+            dilation: int,
+            causal: bool, ):
+        super(TCNBlock, self).__init__()
+        self.in_channels = in_channels
+        self.res_channels = res_channels
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+        self.causal = causal
+        self.receptive_fields = dilation * (kernel_size - 1)
+        self.half_receptive_fields = self.receptive_fields // 2
+        self.conv1 = DSDilatedConv1d(
+            in_channels=in_channels,
+            out_channels=res_channels,
+            kernel_size=kernel_size,
+            dilation=dilation, )
+        self.bn1 = nn.BatchNorm1D(res_channels)
+        self.relu1 = nn.ReLU()
+
+        self.conv2 = nn.Conv1D(
+            in_channels=res_channels, out_channels=res_channels, kernel_size=1)
+        self.bn2 = nn.BatchNorm1D(res_channels)
+        self.relu2 = nn.ReLU()
+
+    def forward(self, inputs: paddle.Tensor):
+        outputs = self.relu1(self.bn1(self.conv1(inputs)))
+        outputs = self.bn2(self.conv2(outputs))
+        if self.causal:
+            inputs = inputs[:, :, self.receptive_fields:]
+        else:
+            inputs = inputs[:, :, self.half_receptive_fields:
+                            -self.half_receptive_fields]
+        if self.in_channels == self.res_channels:
+            res_out = self.relu2(outputs + inputs)
+        else:
+            res_out = self.relu2(outputs)
+        return res_out
+
+
+class TCNStack(nn.Layer):
+    def __init__(
+            self,
+            in_channels: int,
+            stack_num: int,
+            stack_size: int,
+            res_channels: int,
+            kernel_size: int,
+            causal: bool, ):
+        super(TCNStack, self).__init__()
+        self.in_channels = in_channels
+        self.stack_num = stack_num
+        self.stack_size = stack_size
+        self.res_channels = res_channels
+        self.kernel_size = kernel_size
+        self.causal = causal
+        self.res_blocks = self.stack_tcn_blocks()
+        self.receptive_fields = self.calculate_receptive_fields()
+        self.res_blocks = nn.Sequential(*self.res_blocks)
+
+    def calculate_receptive_fields(self):
+        receptive_fields = 0
+        for block in self.res_blocks:
+            receptive_fields += block.receptive_fields
+        return receptive_fields
+
+    def build_dilations(self):
+        dilations = []
+        for s in range(0, self.stack_size):
+            for l in range(0, self.stack_num):
+                dilations.append(2**l)
+        return dilations
+
+    def stack_tcn_blocks(self):
+        dilations = self.build_dilations()
+        res_blocks = nn.LayerList()
+
+        res_blocks.append(
+            TCNBlock(
+                self.in_channels,
+                self.res_channels,
+                self.kernel_size,
+                dilations[0],
+                self.causal, ))
+        for dilation in dilations[1:]:
+            res_blocks.append(
+                TCNBlock(
+                    self.res_channels,
+                    self.res_channels,
+                    self.kernel_size,
+                    dilation,
+                    self.causal, ))
+        return res_blocks
+
+    def forward(self, inputs: paddle.Tensor):
+        outputs = self.res_blocks(inputs)
+        return outputs
+
+
+class MDTC(nn.Layer):
+    def __init__(
+            self,
+            stack_num: int,
+            stack_size: int,
+            in_channels: int,
+            res_channels: int,
+            kernel_size: int,
+            causal: bool, ):
+        super(MDTC, self).__init__()
+        assert kernel_size % 2 == 1
+        self.kernel_size = kernel_size
+        self.causal = causal
+        self.preprocessor = TCNBlock(
+            in_channels, res_channels, kernel_size, dilation=1, causal=causal)
+        self.relu = nn.ReLU()
+        self.blocks = nn.LayerList()
+        self.receptive_fields = self.preprocessor.receptive_fields
+        for i in range(stack_num):
+            self.blocks.append(
+                TCNStack(res_channels, stack_size, 1, res_channels, kernel_size,
+                         causal))
+            self.receptive_fields += self.blocks[-1].receptive_fields
+        self.half_receptive_fields = self.receptive_fields // 2
+
+    def forward(self, x: paddle.Tensor):
+        if self.causal:
+            outputs = F.pad(x, (0, 0, self.receptive_fields, 0, 0, 0),
+                            'constant')
+        else:
+            outputs = F.pad(
+                x,
+                (0, 0, self.half_receptive_fields, self.half_receptive_fields,
+                 0, 0),
+                'constant', )
+        outputs = outputs.transpose([0, 2, 1])
+        outputs_list = []
+        outputs = self.relu(self.preprocessor(outputs))
+        for block in self.blocks:
+            outputs = block(outputs)
+            outputs_list.append(outputs)
+
+        normalized_outputs = []
+        output_size = outputs_list[-1].shape[-1]
+        for x in outputs_list:
+            remove_length = x.shape[-1] - output_size
+            if self.causal and remove_length > 0:
+                normalized_outputs.append(x[:, :, remove_length:])
+            elif not self.causal and remove_length > 1:
+                half_remove_length = remove_length // 2
+                normalized_outputs.append(
+                    x[:, :, half_remove_length:-half_remove_length])
+            else:
+                normalized_outputs.append(x)
+
+        outputs = paddle.zeros_like(
+            outputs_list[-1], dtype=outputs_list[-1].dtype)
+        for x in normalized_outputs:
+            outputs += x
+        outputs = outputs.transpose([0, 2, 1])
+        return outputs, None
-- 
GitLab
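
For reviewers, a minimal smoke test of the MDTC forward pass added by this patch. The hyperparameters below (stack_num=3, stack_size=4, 80-dim input features, 32 residual channels, kernel size 5) are illustrative assumptions, not values fixed by the patch; the constructor only requires an odd kernel_size. The padding in forward() compensates for the receptive field, so the output keeps the input's time length:

    import paddle

    from paddlespeech.kws.models.mdtc import MDTC

    # Illustrative hyperparameters (assumed for this sketch, not set by the patch).
    model = MDTC(
        stack_num=3,     # three TCN stacks after the preprocessor block
        stack_size=4,    # dilations 1, 2, 4, 8 within each stack
        in_channels=80,  # e.g. 80-dim fbank features
        res_channels=32,
        kernel_size=5,   # must be odd (asserted in __init__)
        causal=True)
    model.eval()  # use running BN statistics for a deterministic check

    feats = paddle.randn([8, 100, 80])  # (batch, time, feature)
    outputs, _ = model(feats)           # forward returns (outputs, None)
    print(outputs.shape)                # [8, 100, 32]: time length preserved

The second return value is always None here; it appears to be a placeholder for a hidden state so the model matches a (outputs, hidden) calling convention.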