提交 ab759b16 编写于 作者: H huangyuxin

Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into CLI

......@@ -130,7 +130,7 @@ pull_request_rules:
add: ["Docker"]
- name: "auto add label=Deployment"
conditions:
- files~=^speechnn/
- files~=^speechx/
actions:
label:
add: ["Deployment"]
......@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = '0.1.0'
__version__ = '0.1.1'
......@@ -627,7 +627,7 @@ class FastSpeech2(nn.Layer):
hs = hs + e_embs + p_embs
# (B, Lmax, adim)
hs = self.length_regulator(hs, d_outs, alpha)
hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
else:
d_outs = self.duration_predictor(hs, d_masks)
# use groundtruth in training
......@@ -638,7 +638,7 @@ class FastSpeech2(nn.Layer):
hs = hs + e_embs + p_embs
# (B, Lmax, adim)
hs = self.length_regulator(hs, ds)
hs = self.length_regulator(hs, ds, is_inference=False)
# forward decoder
if olens is not None and not is_inference:
......
......@@ -14,28 +14,9 @@
import paddle
from paddle import nn
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
"""
encodings: (B, T, C)
durations: (B, T)
"""
batch_size, t_enc = paddle.shape(durations)
slens = paddle.sum(durations, -1)
t_dec = paddle.max(slens)
M = paddle.zeros([batch_size, t_dec, t_enc])
for i in range(batch_size):
k = 0
for j in range(t_enc):
d = durations[i, j]
# If the d == 0, slice action is meaningless and not supported
if d >= 1:
M[0, k:k + d, j] = 1
k += d
encodings = paddle.matmul(M, encodings)
return encodings
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
class ResidualBlock(nn.Layer):
......@@ -175,7 +156,8 @@ class SpeedySpeechDecoder(nn.Layer):
class SpeedySpeech(nn.Layer):
def __init__(self,
def __init__(
self,
vocab_size,
encoder_hidden_size,
encoder_kernel_size,
......@@ -186,8 +168,13 @@ class SpeedySpeech(nn.Layer):
decoder_kernel_size,
decoder_dilations,
tone_size=None,
spk_num=None):
spk_num=None,
init_type: str="xavier_uniform", ):
super().__init__()
# initialize parameters
initialize(self, init_type)
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
encoder_hidden_size, encoder_kernel_size,
encoder_dilations, spk_num)
......@@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer):
self.encoder = encoder
self.duration_predictor = duration_predictor
self.decoder = decoder
# define length regulator
self.length_regulator = LengthRegulator()
nn.initializer.set_global_initializer(None)
def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
# input of embedding must be int64
......@@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer):
# expand encodings
durations_to_expand = durations
encodings = expand(encodings, durations_to_expand)
encodings = self.length_regulator(encodings, durations_to_expand)
# decode
# remove positional encoding here
......@@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer):
durations_to_expand = durations_to_expand.astype(paddle.int64)
else:
durations_to_expand = durations
encodings = expand(encodings, durations_to_expand)
encodings = self.length_regulator(
encodings, durations_to_expand, is_inference=True)
shape = paddle.shape(encodings)
t_dec, feature_size = shape[1], shape[2]
......
......@@ -13,6 +13,7 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Length regulator related modules."""
import numpy as np
import paddle
from paddle import nn
......@@ -43,6 +44,28 @@ class LengthRegulator(nn.Layer):
super().__init__()
self.pad_value = pad_value
# expand_numpy is faster than expand
def expand_numpy(self, encodings: paddle.Tensor,
durations: paddle.Tensor) -> paddle.Tensor:
"""
encodings: (B, T, C)
durations: (B, T)
"""
batch_size, t_enc = durations.shape
durations = durations.numpy()
slens = np.sum(durations, -1)
t_dec = np.max(slens)
M = np.zeros([batch_size, t_dec, t_enc])
for i in range(batch_size):
k = 0
for j in range(t_enc):
d = durations[i, j]
M[i, k:k + d, j] = 1
k += d
M = paddle.to_tensor(M, dtype=encodings.dtype)
encodings = paddle.matmul(M, encodings)
return encodings
def expand(self, encodings: paddle.Tensor,
durations: paddle.Tensor) -> paddle.Tensor:
"""
......@@ -50,20 +73,21 @@ class LengthRegulator(nn.Layer):
durations: (B, T)
"""
batch_size, t_enc = paddle.shape(durations)
slens = durations.sum(-1)
t_dec = slens.max()
slens = paddle.sum(durations, -1)
t_dec = paddle.max(slens)
M = paddle.zeros([batch_size, t_dec, t_enc])
for i in range(batch_size):
k = 0
for j in range(t_enc):
d = durations[i, j]
# If the d == 0, slice action is meaningless and not supported in paddle
if d >= 1:
M[i, k:k + d, j] = 1
k += d
encodings = paddle.matmul(M, encodings)
return encodings
def forward(self, xs, ds, alpha=1.0):
def forward(self, xs, ds, alpha=1.0, is_inference=False):
"""Calculate forward propagation.
Parameters
......@@ -85,4 +109,7 @@ class LengthRegulator(nn.Layer):
assert alpha > 0
ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha)
ds = ds.cast(dtype=paddle.int64)
if is_inference:
return self.expand(xs, ds)
else:
return self.expand_numpy(xs, ds)
......@@ -17,6 +17,7 @@ import io
import os
import subprocess as sp
import sys
import paddlespeech
from pathlib import Path
from setuptools import Command
......@@ -172,7 +173,7 @@ class UploadCommand(Command):
setup_info = dict(
# Metadata
name='paddlespeech',
version='0.1.1',
version=paddlespeech.__version__,
author='PaddlePaddle Speech and Language Team',
author_email='paddlesl@baidu.com',
url='https://github.com/PaddlePaddle/PaddleSpeech',
......
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(deepspeech VERSION 0.1)
set(CMAKE_VERBOSE_MAKEFILE on)
# set std-14
set(CMAKE_CXX_STANDARD 14)
# include file
include(FetchContent)
include(ExternalProject)
# fc_patch dir
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
###############################################################################
# Option Configurations
###############################################################################
# option configurations
option(TEST_DEBUG "option for debug" OFF)
###############################################################################
# Include third party
###############################################################################
# #example for include third party
# FetchContent_Declare()
# # FetchContent_MakeAvailable was not added until CMake 3.14
# FetchContent_MakeAvailable()
# include_directories()
# ABSEIL-CPP
include(FetchContent)
FetchContent_Declare(
absl
GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
GIT_TAG "20210324.1"
)
FetchContent_MakeAvailable(absl)
# libsndfile
include(FetchContent)
FetchContent_Declare(
libsndfile
GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
GIT_TAG "1.0.31"
)
FetchContent_MakeAvailable(libsndfile)
###############################################################################
# Add local library
###############################################################################
# system lib
find_package()
# if dir have CmakeLists.txt
add_subdirectory()
# if dir do not have CmakeLists.txt
add_library(lib_name STATIC file.cc)
target_link_libraries(lib_name item0 item1)
add_dependencies(lib_name depend-target)
###############################################################################
# Library installation
###############################################################################
install()
###############################################################################
# Build binary file
###############################################################################
add_executable()
target_link_libraries()
aux_source_directory(. DIR_LIB_SRCS)
add_library(decoder STATIC ${DIR_LIB_SRCS})
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册