Commit ab759b16 authored by huangyuxin

Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into CLI

@@ -130,7 +130,7 @@ pull_request_rules:
         add: ["Docker"]
   - name: "auto add label=Deployment"
     conditions:
-      - files~=^speechnn/
+      - files~=^speechx/
     actions:
       label:
         add: ["Deployment"]
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = '0.1.0'
+__version__ = '0.1.1'
@@ -627,7 +627,7 @@ class FastSpeech2(nn.Layer):
             hs = hs + e_embs + p_embs
             # (B, Lmax, adim)
-            hs = self.length_regulator(hs, d_outs, alpha)
+            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
         else:
             d_outs = self.duration_predictor(hs, d_masks)
             # use groundtruth in training
@@ -638,7 +638,7 @@ class FastSpeech2(nn.Layer):
             hs = hs + e_embs + p_embs
             # (B, Lmax, adim)
-            hs = self.length_regulator(hs, ds)
+            hs = self.length_regulator(hs, ds, is_inference=False)
         # forward decoder
         if olens is not None and not is_inference:
......
@@ -14,28 +14,9 @@
 import paddle
 from paddle import nn

+from paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
+from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator

-def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
-    """
-    encodings: (B, T, C)
-    durations: (B, T)
-    """
-    batch_size, t_enc = paddle.shape(durations)
-    slens = paddle.sum(durations, -1)
-    t_dec = paddle.max(slens)
-    M = paddle.zeros([batch_size, t_dec, t_enc])
-    for i in range(batch_size):
-        k = 0
-        for j in range(t_enc):
-            d = durations[i, j]
-            # If the d == 0, slice action is meaningless and not supported
-            if d >= 1:
-                M[0, k:k + d, j] = 1
-            k += d
-    encodings = paddle.matmul(M, encodings)
-    return encodings

 class ResidualBlock(nn.Layer):
@@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer):
 class SpeedySpeech(nn.Layer):
-    def __init__(self,
-                 vocab_size,
-                 encoder_hidden_size,
-                 encoder_kernel_size,
-                 encoder_dilations,
-                 duration_predictor_hidden_size,
-                 decoder_hidden_size,
-                 decoder_output_size,
-                 decoder_kernel_size,
-                 decoder_dilations,
-                 tone_size=None,
-                 spk_num=None):
+    def __init__(
+            self,
+            vocab_size,
+            encoder_hidden_size,
+            encoder_kernel_size,
+            encoder_dilations,
+            duration_predictor_hidden_size,
+            decoder_hidden_size,
+            decoder_output_size,
+            decoder_kernel_size,
+            decoder_dilations,
+            tone_size=None,
+            spk_num=None,
+            init_type: str="xavier_uniform", ):
         super().__init__()
+
+        # initialize parameters
+        initialize(self, init_type)
+
         encoder = SpeedySpeechEncoder(vocab_size, tone_size,
                                       encoder_hidden_size, encoder_kernel_size,
                                       encoder_dilations, spk_num)
@@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer):
         self.encoder = encoder
         self.duration_predictor = duration_predictor
         self.decoder = decoder
+        # define length regulator
+        self.length_regulator = LengthRegulator()
+        nn.initializer.set_global_initializer(None)

     def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
         # input of embedding must be int64
@@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer):
         # expand encodings
         durations_to_expand = durations
-        encodings = expand(encodings, durations_to_expand)
+        encodings = self.length_regulator(encodings, durations_to_expand)

         # decode
         # remove positional encoding here
@@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer):
             durations_to_expand = durations_to_expand.astype(paddle.int64)
         else:
             durations_to_expand = durations
-        encodings = expand(encodings, durations_to_expand)
+        encodings = self.length_regulator(
+            encodings, durations_to_expand, is_inference=True)

         shape = paddle.shape(encodings)
         t_dec, feature_size = shape[1], shape[2]
......
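The __init__ hunk above follows a common Paddle initialization pattern: install a process-wide default weight initializer before building the sublayers (so they pick it up as they are constructed), then reset it with nn.initializer.set_global_initializer(None) once the model is built so layers created elsewhere keep Paddle's defaults. The sketch below illustrates that pattern with a hypothetical, simplified initialize helper; the real paddlespeech.t2s.modules.nets_utils.initialize may differ in detail.

    import paddle
    from paddle import nn

    def initialize(model: nn.Layer, init_type: str="xavier_uniform"):
        # Hypothetical, simplified stand-in for the project's initialize helper.
        # It installs a global default initializer; `model` only marks the call site,
        # since the global initializer applies to sublayers created afterwards.
        if init_type == "xavier_uniform":
            nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
                                                   nn.initializer.Constant())
        elif init_type == "kaiming_uniform":
            nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(),
                                                   nn.initializer.Constant())
        else:
            raise ValueError("unknown init_type: " + init_type)

    class TinyModel(nn.Layer):
        def __init__(self, init_type: str="xavier_uniform"):
            super().__init__()
            initialize(self, init_type)   # sublayers created below use this initializer
            self.proj = nn.Linear(8, 8)
            # reset, so layers built outside this model are unaffected
            nn.initializer.set_global_initializer(None)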
@@ -13,6 +13,7 @@
 # limitations under the License.
 # Modified from espnet(https://github.com/espnet/espnet)
 """Length regulator related modules."""
+import numpy as np
 import paddle
 from paddle import nn
@@ -43,6 +44,28 @@ class LengthRegulator(nn.Layer):
         super().__init__()
         self.pad_value = pad_value

+    # expand_numpy is faster than expand
+    def expand_numpy(self, encodings: paddle.Tensor,
+                     durations: paddle.Tensor) -> paddle.Tensor:
+        """
+        encodings: (B, T, C)
+        durations: (B, T)
+        """
+        batch_size, t_enc = durations.shape
+        durations = durations.numpy()
+        slens = np.sum(durations, -1)
+        t_dec = np.max(slens)
+        M = np.zeros([batch_size, t_dec, t_enc])
+        for i in range(batch_size):
+            k = 0
+            for j in range(t_enc):
+                d = durations[i, j]
+                M[i, k:k + d, j] = 1
+                k += d
+        M = paddle.to_tensor(M, dtype=encodings.dtype)
+        encodings = paddle.matmul(M, encodings)
+        return encodings
+
     def expand(self, encodings: paddle.Tensor,
                durations: paddle.Tensor) -> paddle.Tensor:
         """
@@ -50,20 +73,21 @@ class LengthRegulator(nn.Layer):
         durations: (B, T)
         """
         batch_size, t_enc = paddle.shape(durations)
-        slens = durations.sum(-1)
-        t_dec = slens.max()
+        slens = paddle.sum(durations, -1)
+        t_dec = paddle.max(slens)
         M = paddle.zeros([batch_size, t_dec, t_enc])
         for i in range(batch_size):
             k = 0
             for j in range(t_enc):
                 d = durations[i, j]
+                # If the d == 0, slice action is meaningless and not supported in paddle
                 if d >= 1:
                     M[i, k:k + d, j] = 1
                 k += d
         encodings = paddle.matmul(M, encodings)
         return encodings

-    def forward(self, xs, ds, alpha=1.0):
+    def forward(self, xs, ds, alpha=1.0, is_inference=False):
         """Calculate forward propagation.

         Parameters
@@ -85,4 +109,7 @@ class LengthRegulator(nn.Layer):
         assert alpha > 0
         ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha)
         ds = ds.cast(dtype=paddle.int64)
-        return self.expand(xs, ds)
+        if is_inference:
+            return self.expand(xs, ds)
+        else:
+            return self.expand_numpy(xs, ds)
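Both expand variants implement the same length-regulation trick: build a 0/1 alignment matrix M of shape (B, t_dec, t_enc) in which encoder frame j is repeated durations[b, j] times, then matmul it with the encodings. The new forward routes inference through the pure-Paddle expand and the training path through expand_numpy, which the added comment notes is faster, presumably because the Python loop fills a NumPy array instead of indexing Paddle tensors. A minimal, self-contained sketch of the expansion with toy shapes (not the project's API):

    import numpy as np
    import paddle

    # Frame j of `encodings` is repeated durations[b, j] times via matmul with M.
    encodings = paddle.to_tensor(
        np.arange(6, dtype="float32").reshape([1, 3, 2]))   # (B=1, T=3, C=2)
    durations = np.array([[2, 0, 3]], dtype="int64")         # (B=1, T=3)

    batch_size, t_enc = durations.shape
    t_dec = int(durations.sum(-1).max())
    M = np.zeros([batch_size, t_dec, t_enc], dtype="float32")
    for b in range(batch_size):
        k = 0
        for j in range(t_enc):
            d = int(durations[b, j])
            M[b, k:k + d, j] = 1
            k += d

    expanded = paddle.matmul(paddle.to_tensor(M), encodings)
    print(expanded.shape)   # [1, 5, 2]: frame 0 twice, frame 1 dropped, frame 2 three times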
@@ -17,6 +17,7 @@ import io
 import os
 import subprocess as sp
 import sys
+import paddlespeech
 from pathlib import Path

 from setuptools import Command
@@ -172,7 +173,7 @@ class UploadCommand(Command):
 setup_info = dict(
     # Metadata
     name='paddlespeech',
-    version='0.1.1',
+    version=paddlespeech.__version__,
     author='PaddlePaddle Speech and Language Team',
     author_email='paddlesl@baidu.com',
     url='https://github.com/PaddlePaddle/PaddleSpeech',
......
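The setup.py hunk single-sources the release number from the package itself, so the version only has to be bumped in paddlespeech/__init__.py (the '0.1.0' to '0.1.1' hunk near the top of this commit). A minimal sketch of that pattern, not the project's full setup.py:

    import paddlespeech              # requires the package to be importable at build time
    from setuptools import setup

    setup(
        name="paddlespeech",
        version=paddlespeech.__version__,   # '0.1.1' after the __init__.py change above
    )

One caveat of importing the package from setup.py is that any import-time dependencies of paddlespeech must already be present in the build environment; an alternative some projects use is parsing __version__ out of the file as text.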
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(deepspeech VERSION 0.1)
set(CMAKE_VERBOSE_MAKEFILE on)
# set std-14
set(CMAKE_CXX_STANDARD 14)
# include file
include(FetchContent)
include(ExternalProject)
# fc_patch dir
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
###############################################################################
# Option Configurations
###############################################################################
# option configurations
option(TEST_DEBUG "option for debug" OFF)
###############################################################################
# Include third party
###############################################################################
# #example for include third party
# FetchContent_Declare()
# # FetchContent_MakeAvailable was not added until CMake 3.14
# FetchContent_MakeAvailable()
# include_directories()
# ABSEIL-CPP
include(FetchContent)
FetchContent_Declare(
absl
GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
GIT_TAG "20210324.1"
)
FetchContent_MakeAvailable(absl)
# libsndfile
include(FetchContent)
FetchContent_Declare(
libsndfile
GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
GIT_TAG "1.0.31"
)
FetchContent_MakeAvailable(libsndfile)
###############################################################################
# Add local library
###############################################################################
# system lib
find_package()
# if dir have CmakeLists.txt
add_subdirectory()
# if dir do not have CmakeLists.txt
add_library(lib_name STATIC file.cc)
target_link_libraries(lib_name item0 item1)
add_dependencies(lib_name depend-target)
###############################################################################
# Library installation
###############################################################################
install()
###############################################################################
# Build binary file
###############################################################################
add_executable()
target_link_libraries()
aux_source_directory(. DIR_LIB_SRCS)
add_library(decoder STATIC ${DIR_LIB_SRCS})