Commit ab759b16 authored by huangyuxin

Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into CLI

@@ -130,7 +130,7 @@ pull_request_rules:
         add: ["Docker"]
   - name: "auto add label=Deployment"
     conditions:
-      - files~=^speechnn/
+      - files~=^speechx/
     actions:
       label:
         add: ["Deployment"]
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = '0.1.0'
+__version__ = '0.1.1'
@@ -627,7 +627,7 @@ class FastSpeech2(nn.Layer):
             hs = hs + e_embs + p_embs
             # (B, Lmax, adim)
-            hs = self.length_regulator(hs, d_outs, alpha)
+            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
         else:
             d_outs = self.duration_predictor(hs, d_masks)
             # use groundtruth in training
@@ -638,7 +638,7 @@ class FastSpeech2(nn.Layer):
             hs = hs + e_embs + p_embs
             # (B, Lmax, adim)
-            hs = self.length_regulator(hs, ds)
+            hs = self.length_regulator(hs, ds, is_inference=False)
         # forward decoder
         if olens is not None and not is_inference:
......
@@ -14,28 +14,9 @@
 import paddle
 from paddle import nn

+from paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
+from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator

-def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
-    """
-    encodings: (B, T, C)
-    durations: (B, T)
-    """
-    batch_size, t_enc = paddle.shape(durations)
-    slens = paddle.sum(durations, -1)
-    t_dec = paddle.max(slens)
-    M = paddle.zeros([batch_size, t_dec, t_enc])
-    for i in range(batch_size):
-        k = 0
-        for j in range(t_enc):
-            d = durations[i, j]
-            # If the d == 0, slice action is meaningless and not supported
-            if d >= 1:
-                M[0, k:k + d, j] = 1
-            k += d
-    encodings = paddle.matmul(M, encodings)
-    return encodings

 class ResidualBlock(nn.Layer):
@@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer):
 class SpeedySpeech(nn.Layer):
-    def __init__(self,
-                 vocab_size,
-                 encoder_hidden_size,
-                 encoder_kernel_size,
-                 encoder_dilations,
-                 duration_predictor_hidden_size,
-                 decoder_hidden_size,
-                 decoder_output_size,
-                 decoder_kernel_size,
-                 decoder_dilations,
-                 tone_size=None,
-                 spk_num=None):
+    def __init__(
+            self,
+            vocab_size,
+            encoder_hidden_size,
+            encoder_kernel_size,
+            encoder_dilations,
+            duration_predictor_hidden_size,
+            decoder_hidden_size,
+            decoder_output_size,
+            decoder_kernel_size,
+            decoder_dilations,
+            tone_size=None,
+            spk_num=None,
+            init_type: str="xavier_uniform", ):
         super().__init__()
+
+        # initialize parameters
+        initialize(self, init_type)
+
         encoder = SpeedySpeechEncoder(vocab_size, tone_size,
                                       encoder_hidden_size, encoder_kernel_size,
                                       encoder_dilations, spk_num)
@@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer):
         self.encoder = encoder
         self.duration_predictor = duration_predictor
         self.decoder = decoder
+        # define length regulator
+        self.length_regulator = LengthRegulator()
+        nn.initializer.set_global_initializer(None)

     def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
         # input of embedding must be int64
@@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer):
         # expand encodings
         durations_to_expand = durations
-        encodings = expand(encodings, durations_to_expand)
+        encodings = self.length_regulator(encodings, durations_to_expand)

         # decode
         # remove positional encoding here
@@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer):
             durations_to_expand = durations_to_expand.astype(paddle.int64)
         else:
             durations_to_expand = durations
-        encodings = expand(encodings, durations_to_expand)
+        encodings = self.length_regulator(
+            encodings, durations_to_expand, is_inference=True)

         shape = paddle.shape(encodings)
         t_dec, feature_size = shape[1], shape[2]
......
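The __init__ hunk above follows a common Paddle initialization pattern: install a process-wide default weight initializer before building the sublayers (so they pick it up as they are constructed), then reset it with nn.initializer.set_global_initializer(None) once the model is built so layers created elsewhere keep Paddle's defaults. The sketch below illustrates that pattern with a hypothetical, simplified initialize helper; the real paddlespeech.t2s.modules.nets_utils.initialize may differ in detail.

    import paddle
    from paddle import nn

    def initialize(model: nn.Layer, init_type: str="xavier_uniform"):
        # Hypothetical, simplified stand-in for the project's initialize helper.
        # It installs a global default initializer; `model` only marks the call site,
        # since the global initializer applies to sublayers created afterwards.
        if init_type == "xavier_uniform":
            nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
                                                   nn.initializer.Constant())
        elif init_type == "kaiming_uniform":
            nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
                                                   nn.initializer.Constant())
        else:
            raise ValueError("unknown init_type: " + init_type)

    class TinyModel(nn.Layer):
        def __init__(self, init_type: str="xavier_uniform"):
            super().__init__()
            initialize(self, init_type)   # sublayers created below use this initializer
            self.proj = nn.Linear(8, 8)
            # reset, so layers built outside this model are unaffected
            nn.initializer.set_global_initializer(None)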
@@ -13,6 +13,7 @@
 # limitations under the License.
 # Modified from espnet(https://github.com/espnet/espnet)
 """Length regulator related modules."""
+import numpy as np
 import paddle
 from paddle import nn
@@ -43,6 +44,28 @@ class LengthRegulator(nn.Layer):
         super().__init__()
         self.pad_value = pad_value

+    # expand_numpy is faster than expand
+    def expand_numpy(self, encodings: paddle.Tensor,
+                     durations: paddle.Tensor) -> paddle.Tensor:
+        """
+        encodings: (B, T, C)
+        durations: (B, T)
+        """
+        batch_size, t_enc = durations.shape
+        durations = durations.numpy()
+        slens = np.sum(durations, -1)
+        t_dec = np.max(slens)
+        M = np.zeros([batch_size, t_dec, t_enc])
+        for i in range(batch_size):
+            k = 0
+            for j in range(t_enc):
+                d = durations[i, j]
+                M[i, k:k + d, j] = 1
+                k += d
+        M = paddle.to_tensor(M, dtype=encodings.dtype)
+        encodings = paddle.matmul(M, encodings)
+        return encodings
+
     def expand(self, encodings: paddle.Tensor,
                durations: paddle.Tensor) -> paddle.Tensor:
         """
@@ -50,20 +73,21 @@ class LengthRegulator(nn.Layer):
         durations: (B, T)
         """
         batch_size, t_enc = paddle.shape(durations)
-        slens = durations.sum(-1)
-        t_dec = slens.max()
+        slens = paddle.sum(durations, -1)
+        t_dec = paddle.max(slens)
         M = paddle.zeros([batch_size, t_dec, t_enc])
         for i in range(batch_size):
             k = 0
             for j in range(t_enc):
                 d = durations[i, j]
+                # If the d == 0, slice action is meaningless and not supported in paddle
                 if d >= 1:
                     M[i, k:k + d, j] = 1
                 k += d
         encodings = paddle.matmul(M, encodings)
         return encodings

-    def forward(self, xs, ds, alpha=1.0):
+    def forward(self, xs, ds, alpha=1.0, is_inference=False):
         """Calculate forward propagation.

         Parameters
@@ -85,4 +109,7 @@ class LengthRegulator(nn.Layer):
         assert alpha > 0
         ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha)
         ds = ds.cast(dtype=paddle.int64)
-        return self.expand(xs, ds)
+        if is_inference:
+            return self.expand(xs, ds)
+        else:
+            return self.expand_numpy(xs, ds)
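Both expand variants implement the same length-regulation trick: build a 0/1 alignment matrix M of shape (B, t_dec, t_enc) in which encoder frame j is repeated durations[b, j] times, then matmul it with the encodings. The new forward routes inference through the pure-Paddle expand and the training path through expand_numpy, which the added comment notes is faster, presumably because the Python loop fills a NumPy array instead of indexing Paddle tensors. A minimal, self-contained sketch of the expansion with toy shapes (not the project's API):

    import numpy as np
    import paddle

    # Frame j of `encodings` is repeated durations[b, j] times via matmul with M.
    encodings = paddle.to_tensor(
        np.arange(6, dtype="float32").reshape([1, 3, 2]))   # (B=1, T=3, C=2)
    durations = np.array([[2, 0, 3]], dtype="int64")         # (B=1, T=3)

    batch_size, t_enc = durations.shape
    t_dec = int(durations.sum(-1).max())
    M = np.zeros([batch_size, t_dec, t_enc], dtype="float32")
    for b in range(batch_size):
        k = 0
        for j in range(t_enc):
            d = int(durations[b, j])
            M[b, k:k + d, j] = 1
            k += d

    expanded = paddle.matmul(paddle.to_tensor(M), encodings)
    print(expanded.shape)   # [1, 5, 2]: frame 0 twice, frame 1 dropped, frame 2 three times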
@@ -17,6 +17,7 @@ import io
 import os
 import subprocess as sp
 import sys
+import paddlespeech
 from pathlib import Path

 from setuptools import Command
@@ -172,7 +173,7 @@ class UploadCommand(Command):
 setup_info = dict(
     # Metadata
     name='paddlespeech',
-    version='0.1.1',
+    version=paddlespeech.__version__,
     author='PaddlePaddle Speech and Language Team',
     author_email='paddlesl@baidu.com',
     url='https://github.com/PaddlePaddle/PaddleSpeech',
......
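The setup.py hunk single-sources the release number from the package itself, so the version only has to be bumped in paddlespeech/__init__.py (the '0.1.0' to '0.1.1' hunk near the top of this commit). A minimal sketch of that pattern, not the project's full setup.py:

    import paddlespeech              # requires the package to be importable at build time
    from setuptools import setup

    setup(
        name="paddlespeech",
        version=paddlespeech.__version__,   # '0.1.1' after the __init__.py change above
    )

One caveat of importing the package from setup.py is that any import-time dependencies of paddlespeech must already be present in the build environment; an alternative some projects use is parsing __version__ out of the file as text.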
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(deepspeech VERSION 0.1)
set(CMAKE_VERBOSE_MAKEFILE on)
# set std-14
set(CMAKE_CXX_STANDARD 14)
# include file
include(FetchContent)
include(ExternalProject)
# fc_patch dir
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
###############################################################################
# Option Configurations
###############################################################################
# option configurations
option(TEST_DEBUG "option for debug" OFF)
###############################################################################
# Include third party
###############################################################################
# #example for include third party
# FetchContent_Declare()
# # FetchContent_MakeAvailable was not added until CMake 3.14
# FetchContent_MakeAvailable()
# include_directories()
# ABSEIL-CPP
include(FetchContent)
FetchContent_Declare(
absl
GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
GIT_TAG "20210324.1"
)
FetchContent_MakeAvailable(absl)
# libsndfile
include(FetchContent)
FetchContent_Declare(
libsndfile
GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
GIT_TAG "1.0.31"
)
FetchContent_MakeAvailable(libsndfile)
###############################################################################
# Add local library
###############################################################################
# system lib
find_package()
# if dir have CmakeLists.txt
add_subdirectory()
# if dir do not have CmakeLists.txt
add_library(lib_name STATIC file.cc)
target_link_libraries(lib_name item0 item1)
add_dependencies(lib_name depend-target)
###############################################################################
# Library installation
###############################################################################
install()
###############################################################################
# Build binary file
###############################################################################
add_executable()
target_link_libraries()
aux_source_directory(. DIR_LIB_SRCS)
add_library(decoder STATIC ${DIR_LIB_SRCS})