add tts cli

2189b460 · 小湉湉 · 24beeca5 · 2189b460 · 2189b460 · 2189b460
33 changed files
--- a/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -25,7 +25,6 @@ import os
 from pathlib import Path
 import soundfile
 from utils.utility import download
 from utils.utility import unpack

--- a/dataset/aishell/aishell.py
+++ b/dataset/aishell/aishell.py
@@ -25,7 +25,6 @@ import os
 from pathlib import Path
 import soundfile
 from utils.utility import download
 from utils.utility import unpack

--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@@ -27,7 +27,6 @@ import os
 from multiprocessing.pool import Pool
 import soundfile
 from utils.utility import download
 from utils.utility import unpack

--- a/dataset/mini_librispeech/mini_librispeech.py
+++ b/dataset/mini_librispeech/mini_librispeech.py
@@ -26,7 +26,6 @@ import os
 from multiprocessing.pool import Pool
 import soundfile
 from utils.utility import download
 from utils.utility import unpack

--- a/dataset/musan/musan.py
+++ b/dataset/musan/musan.py
@@ -28,7 +28,6 @@ import json
 import os
 import soundfile
 from utils.utility import download
 from utils.utility import unpack

--- a/dataset/rir_noise/rir_noise.py
+++ b/dataset/rir_noise/rir_noise.py
@@ -28,7 +28,6 @@ import json
 import os
 import soundfile
 from utils.utility import download
 from utils.utility import unzip

--- a/dataset/thchs30/thchs30.py
+++ b/dataset/thchs30/thchs30.py
@@ -26,7 +26,6 @@ from multiprocessing.pool import Pool
 from pathlib import Path
 import soundfile
 from utils.utility import download
 from utils.utility import unpack

--- a/dataset/timit/timit.py
+++ b/dataset/timit/timit.py
@@ -27,7 +27,6 @@ import string
 from pathlib import Path
 import soundfile
 from utils.utility import unzip
 URL_ROOT = ""

--- a/dataset/voxforge/voxforge.py
+++ b/dataset/voxforge/voxforge.py
@@ -27,7 +27,6 @@ import shutil
 import subprocess
 import soundfile
 from utils.utility import download_multi
 from utils.utility import getfile_insensitive
 from utils.utility import unpack

--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -16,3 +16,4 @@ from .base_commands import BaseCommand
 from .base_commands import HelpCommand
 from .cls import CLSExecutor
 from .st import STExecutor
+from .tts import TTSExecutor
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -119,7 +119,7 @@ class ASRExecutor(BaseExecutor):
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """
-            Download and returns pretrained resources path of current task.
+        Download and returns pretrained resources path of current task.
        """
        assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
            tag)
@@ -140,7 +140,7 @@ class ASRExecutor(BaseExecutor):
                        cfg_path: Optional[os.PathLike]=None,
                        ckpt_path: Optional[os.PathLike]=None):
        """
-            Init model and other resources from a specific path.
+        Init model and other resources from a specific path.
        """
        if hasattr(self, 'model'):
            logger.info('Model had been initialized.')
@@ -216,8 +216,8 @@ class ASRExecutor(BaseExecutor):
    def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
        """
-            Input preprocess and return paddle.Tensor stored in self.input.
+        Input preprocess and return paddle.Tensor stored in self.input.
-            Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
+        Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
        """
        audio_file = input
@@ -291,7 +291,7 @@ class ASRExecutor(BaseExecutor):
    @paddle.no_grad()
    def infer(self, model_type: str):
        """
-            Model inference and result stored in self.output.
+        Model inference and result stored in self.output.
        """
        text_feature = TextFeaturizer(
            unit_type=self.config.collator.unit_type,
@@ -438,7 +438,7 @@ class ASRExecutor(BaseExecutor):
    def __call__(self, model, lang, sample_rate, config, ckpt_path, audio_file,
                 device):
        """
-            Python API to call an executor.
+        Python API to call an executor.
        """
        audio_file = os.path.abspath(audio_file)
        self._check(audio_file, sample_rate)

--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -20,14 +20,14 @@ from typing import Union
 import numpy as np
 import paddle
 import yaml
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
 from ..executor import BaseExecutor
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import logger
 from ..utils import MODEL_HOME
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 __all__ = ['CLSExecutor']

--- a/paddlespeech/cli/download.py
+++ b/paddlespeech/cli/download.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import hashlib
+import os
+import os.path as osp
+import shutil
+import subprocess
+import sys
+import tarfile
+import time
+import zipfile
+import requests
+try:
+    from tqdm import tqdm
+except ImportError:
+    # tqdm is optional. Only a failed import should activate the fallback;
+    # a bare `except:` would also hide KeyboardInterrupt and real errors.
+    class tqdm(object):
+        """Minimal stand-in for `tqdm.tqdm` used when tqdm is not installed.
+        It provides only what this module uses: `update()` and the context
+        manager protocol. Progress is written to stderr on a single line.
+        """
+        def __init__(self, total=None):
+            # total: expected number of update units, or None if unknown.
+            self.total = total
+            self.n = 0
+        def update(self, n):
+            """Advance the counter by `n` units and redraw the progress line."""
+            self.n += n
+            if self.total is None:
+                sys.stderr.write("\r{0:.1f} bytes".format(self.n))
+            else:
+                sys.stderr.write(
+                    "\r{0:.1f}%".format(100 * self.n / float(self.total)))
+            sys.stderr.flush()
+        def __enter__(self):
+            return self
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            # Finish the line that was being rewritten with '\r'.
+            sys.stderr.write('\n')
+import logging
+logger = logging.getLogger(__name__)
+# Public API of this module.
+__all__ = ['get_weights_path_from_url']
+# Directory that get_weights_path_from_url downloads into.
+WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")
+# Maximum number of attempts made by _download; also passed to wget as `-t`.
+DOWNLOAD_RETRY_LIMIT = 3
+def is_url(path):
+    """
+    Tell whether `path` looks like an HTTP(S) URL.
+    Args:
+        path (string): candidate string to inspect.
+    Returns:
+        bool: True when `path` begins with 'http://' or 'https://'.
+    """
+    return path.startswith(('http://', 'https://'))
+def get_weights_path_from_url(url, md5sum=None):
+    """Return the local path of the weights at `url`, stored under
+    WEIGHTS_HOME; the file is downloaded first when it is not there yet.
+    Args:
+        url (str): download url
+        md5sum (str): md5 sum of download package
+    Returns:
+        str: a local path to save downloaded weights.
+    Examples:
+        .. code-block:: python
+            from paddlespeech.cli.download import get_weights_path_from_url
+            resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams'
+            local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url)
+    """
+    return get_path_from_url(url, WEIGHTS_HOME, md5sum)
+def _map_path(url, root_dir):
+    # The downloaded file keeps the last path component of the url as its
+    # name and is placed directly under root_dir.
+    return osp.join(root_dir, osp.split(url)[-1])
+def _get_unique_endpoints(trainer_endpoints):
+    # Keep one endpoint per ip: the first one in sorted order. The list is
+    # sorted in place so the outcome does not depend on the order in which
+    # each card's environment lists the endpoints.
+    trainer_endpoints.sort()
+    seen_ips = set()
+    kept = set()
+    for ep in trainer_endpoints:
+        host = ep.split(":")[0]
+        if host not in seen_ips:
+            seen_ips.add(host)
+            kept.add(ep)
+    logger.info("unique_endpoints {}".format(kept))
+    return kept
+def get_path_from_url(url,
+                      root_dir,
+                      md5sum=None,
+                      check_exist=True,
+                      decompress=True,
+                      method='get'):
+    """ Download from given url to root_dir.
+    If the file named by the url already exists under root_dir (and passes
+    the md5 check), its path is used directly; otherwise it is downloaded
+    from url. The file is then decompressed when it is a tar or zip archive
+    and `decompress` is true, and the resulting path is returned.
+    Args:
+        url (str): download url
+        root_dir (str): root dir for downloading, it should be
+                        WEIGHTS_HOME or DATASET_HOME
+        md5sum (str): md5 sum of download package
+        check_exist (bool): reuse an already existing local file instead of
+                        downloading again. Default is `True`
+        decompress (bool): decompress zip or tar file. Default is `True`
+        method (str): which download method to use. Support `wget` and `get`. Default is `get`.
+    Returns:
+        str: a local path to save downloaded models & weights & datasets.
+    """
+    # Imported lazily, inside the function, rather than at module import time.
+    from paddle.fluid.dygraph.parallel import ParallelEnv
+    assert is_url(url), "downloading from {} not a url".format(url)
+    # parse path after download to decompress under root_dir
+    fullpath = _map_path(url, root_dir)
+    # Mainly used to solve the problem of downloading data from different
+    # machines in the case of multiple machines. Different ips will download
+    # data, and the same ip will only download data once.
+    unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:])
+    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
+        logger.info("Found {}".format(fullpath))
+    else:
+        if ParallelEnv().current_endpoint in unique_endpoints:
+            fullpath = _download(url, root_dir, md5sum, method=method)
+        else:
+            # Not the downloading process for this ip: poll once per second
+            # until the file appears. There is no timeout on this wait.
+            while not os.path.exists(fullpath):
+                time.sleep(1)
+    # Only the selected endpoint decompresses; the other processes return the
+    # path computed by _map_path unchanged.
+    if ParallelEnv().current_endpoint in unique_endpoints:
+        if decompress and (tarfile.is_tarfile(fullpath) or
+                           zipfile.is_zipfile(fullpath)):
+            fullpath = _decompress(fullpath)
+    return fullpath
+def _get_download(url, fullname):
+    """Download `url` to `fullname` with a streaming `requests.get`.
+    Returns `fullname` on success and False when the request itself raises.
+    Raises RuntimeError when the server answers with a status other than 200.
+    """
+    fname = osp.basename(fullname)
+    try:
+        req = requests.get(url, stream=True)
+    except Exception as e:  # requests.exceptions.ConnectionError
+        logger.info("Downloading {} from {} failed with exception {}".format(
+            fname, url, str(e)))
+        return False
+    if req.status_code != 200:
+        raise RuntimeError("Downloading from {} failed with code "
+                           "{}!".format(url, req.status_code))
+    # To protect against an interrupted download, write to
+    # tmp_fullname first and move tmp_fullname to fullname
+    # only after the download has finished.
+    tmp_fullname = fullname + "_tmp"
+    total_size = req.headers.get('content-length')
+    with open(tmp_fullname, 'wb') as f:
+        if total_size:
+            # One progress unit per 1024-byte chunk; total is rounded up.
+            with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
+                for chunk in req.iter_content(chunk_size=1024):
+                    f.write(chunk)
+                    pbar.update(1)
+        else:
+            # Size unknown: stream without a progress bar.
+            for chunk in req.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+    shutil.move(tmp_fullname, fullname)
+    return fullname
+def _wget_download(url, fullname):
+    """Download `url` to `fullname` by running the external `wget` program.
+    The data is written to `fullname + "_tmp"` and moved to `fullname` once
+    wget has exited successfully.
+    Returns `fullname`. Raises RuntimeError when wget cannot be started or
+    exits with a non-zero status.
+    """
+    tmp_fullname = fullname + "_tmp"
+    # Pass an argument list and no shell, so that spaces or shell
+    # metacharacters in the url or the path are never interpreted.
+    command = [
+        'wget', '-O', tmp_fullname, '-t', str(DOWNLOAD_RETRY_LIMIT), url
+    ]
+    command_str = ' '.join(command)
+    try:
+        subprc = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except OSError:
+        # Without a shell a missing executable raises OSError; keep raising
+        # RuntimeError as callers saw when the shell reported the failure.
+        raise RuntimeError(
+            '{} failed. Please make sure `wget` is installed or {} exists'.
+            format(command_str, url))
+    _ = subprc.communicate()
+    if subprc.returncode != 0:
+        raise RuntimeError(
+            '{} failed. Please make sure `wget` is installed or {} exists'.
+            format(command_str, url))
+    shutil.move(tmp_fullname, fullname)
+    return fullname
+# Maps the `method` argument of _download to the function that implements it.
+_download_methods = {
+    'get': _get_download,
+    'wget': _wget_download,
+}
+def _download(url, path, md5sum=None, method='get'):
+    """
+    Fetch `url` into directory `path`, retrying until the file exists and
+    passes the md5 check.
+    url (str): download url
+    path (str): directory to download into; created when missing
+    md5sum (str): md5 sum of download package
+    method (str): which download method to use. Support `wget` and `get`. Default is `get`.
+    Returns the full path of the downloaded file. Raises RuntimeError once
+    DOWNLOAD_RETRY_LIMIT attempts have been made without success.
+    """
+    assert method in _download_methods, 'make sure `{}` implemented'.format(
+        method)
+    if not osp.exists(path):
+        os.makedirs(path)
+    fname = osp.split(url)[-1]
+    fullname = osp.join(path, fname)
+    logger.info("Downloading {} from {}".format(fname, url))
+    fetch = _download_methods[method]
+    attempts = 0
+    while True:
+        if osp.exists(fullname) and _md5check(fullname, md5sum):
+            return fullname
+        if attempts >= DOWNLOAD_RETRY_LIMIT:
+            raise RuntimeError("Download from {} failed. "
+                               "Retry limit reached".format(url))
+        attempts += 1
+        # A falsy result means the attempt failed; pause before retrying.
+        if not fetch(url, fullname):
+            time.sleep(1)
+def _md5check(fullname, md5sum=None):
+    """Return True when the md5 of file `fullname` equals `md5sum`.
+    When `md5sum` is None the check is skipped and True is returned.
+    """
+    if md5sum is None:
+        return True
+    logger.info("File {} md5 checking...".format(fullname))
+    digest = hashlib.md5()
+    with open(fullname, 'rb') as stream:
+        # Hash in 4096-byte blocks so the file is never loaded whole.
+        while True:
+            block = stream.read(4096)
+            if block == b"":
+                break
+            digest.update(block)
+    calc_md5sum = digest.hexdigest()
+    if calc_md5sum == md5sum:
+        return True
+    logger.info("File {} md5 check failed, {}(calc) != "
+                "{}(base)".format(fullname, calc_md5sum, md5sum))
+    return False
+def _decompress(fname):
+    """
+    Unpack the tar or zip archive `fname` and return the extracted path.
+    Raises TypeError when `fname` is neither a tar nor a zip file.
+    """
+    logger.info("Decompressing {}...".format(fname))
+    # Dispatch on the archive type; tar is probed before zip.
+    if tarfile.is_tarfile(fname):
+        return _uncompress_file_tar(fname)
+    if zipfile.is_zipfile(fname):
+        return _uncompress_file_zip(fname)
+    raise TypeError("Unsupport compress file type {}".format(fname))
+def _uncompress_file_zip(filepath):
+    """Extract the zip archive `filepath` next to itself.
+    Three layouts are handled:
+      * a single file: extracted into the archive's directory;
+      * a single top-level directory: extracted into the archive's directory;
+      * anything else: extracted into a new directory named after the
+        archive (file name without its last extension).
+    Returns the path of the extracted file or directory.
+    """
+    file_dir = os.path.dirname(filepath)
+    # The context manager closes the archive even when extraction raises.
+    with zipfile.ZipFile(filepath, 'r') as files:
+        file_list = files.namelist()
+        if _is_a_single_file(file_list):
+            uncompressed_path = os.path.join(file_dir, file_list[0])
+            extract_dir = file_dir
+        elif _is_a_single_dir(file_list):
+            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0]
+            uncompressed_path = os.path.join(file_dir, rootpath)
+            extract_dir = file_dir
+        else:
+            rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+            uncompressed_path = os.path.join(file_dir, rootpath)
+            if not os.path.exists(uncompressed_path):
+                os.makedirs(uncompressed_path)
+            extract_dir = uncompressed_path
+        for item in file_list:
+            files.extract(item, extract_dir)
+    return uncompressed_path
+def _uncompress_file_tar(filepath, mode="r:*"):
+    """Extract the tar archive `filepath` next to itself.
+    Three layouts are handled:
+      * a single file: extracted into the archive's directory;
+      * a single top-level directory: extracted into the archive's directory;
+      * anything else: extracted into a new directory named after the
+        archive (file name without its last extension).
+    `mode` is passed to `tarfile.open`. Returns the path of the extracted
+    file or directory.
+    """
+    file_dir = os.path.dirname(filepath)
+    # NOTE(review): members are extracted as listed, without inspecting their
+    # paths; only use this on archives from trusted sources.
+    # The context manager closes the archive even when extraction raises.
+    with tarfile.open(filepath, mode) as files:
+        file_list = files.getnames()
+        if _is_a_single_file(file_list):
+            uncompressed_path = os.path.join(file_dir, file_list[0])
+            extract_dir = file_dir
+        elif _is_a_single_dir(file_list):
+            # Take the FIRST path component (the shared top-level directory),
+            # as the zip variant does; the last component is wrong whenever
+            # the first member is not the directory entry itself.
+            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0]
+            uncompressed_path = os.path.join(file_dir, rootpath)
+            extract_dir = file_dir
+        else:
+            rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+            uncompressed_path = os.path.join(file_dir, rootpath)
+            if not os.path.exists(uncompressed_path):
+                os.makedirs(uncompressed_path)
+            extract_dir = uncompressed_path
+        for item in file_list:
+            files.extract(item, extract_dir)
+    return uncompressed_path
+def _is_a_single_file(file_list):
+    """Return True when the archive listing is exactly one entry that has no
+    directory component (no `os.sep` in its name)."""
+    # str.find returns -1 when the separator is absent and never less than
+    # that, so the absence test must be `== -1`.
+    return len(file_list) == 1 and file_list[0].find(os.sep) == -1
+def _is_a_single_dir(file_list):
+    """Return True when every entry of the archive listing shares the same
+    first path component."""
+    # Normalise separators to os.sep: '/' takes precedence, '\\' is only
+    # rewritten for entries that contain no '/'.
+    normalized = [
+        entry.replace('/', os.sep)
+        if '/' in entry else entry.replace('\\', os.sep)
+        for entry in file_list
+    ]
+    top = normalized[0].split(os.sep)[0]
+    return all(entry.split(os.sep)[0] == top for entry in normalized[1:])
--- a/paddlespeech/cli/tts/__init.__py
+++ b/paddlespeech/cli/tts/__init.__py
--- a/paddlespeech/cli/tts/__init__.py
+++ b/paddlespeech/cli/tts/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .infer import TTSExecutor
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@@ -22,8 +22,8 @@ from typing import Dict
 from typing import List
 from paddle.framework import load
-from paddle.utils import download
+from . import download
 from .entry import commands
 __all__ = [
@@ -78,7 +78,6 @@ def _md5check(filepath: os.PathLike, md5sum: str) -> bool:
 def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
    file_dir = os.path.dirname(filepath)
    if tarfile.is_tarfile(filepath):
        files = tarfile.open(filepath, "r:*")
        file_list = files.getnames()
@@ -87,12 +86,11 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
        file_list = files.namelist()
    else:
        return file_dir
    if _is_a_single_file(file_list):
        rootpath = file_list[0]
        uncompressed_path = os.path.join(file_dir, rootpath)
    elif _is_a_single_dir(file_list):
-        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0]
        uncompressed_path = os.path.join(file_dir, rootpath)
    else:
        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]

--- a/paddlespeech/cls/exps/panns/deploy/predict.py
+++ b/paddlespeech/cls/exps/panns/deploy/predict.py
@@ -16,11 +16,10 @@ import os
 import numpy as np
 from paddle import inference
-from scipy.special import softmax
 from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets import ESC50
 from paddleaudio.features import melspectrogram
+from scipy.special import softmax
 # yapf: disable
 parser = argparse.ArgumentParser()

--- a/paddlespeech/cls/exps/panns/export_model.py
+++ b/paddlespeech/cls/exps/panns/export_model.py
@@ -15,8 +15,8 @@ import argparse
 import os
 import paddle
 from paddleaudio.datasets import ESC50
 from paddlespeech.cls.models import cnn14
 from paddlespeech.cls.models import SoundClassifier

--- a/paddlespeech/cls/exps/panns/predict.py
+++ b/paddlespeech/cls/exps/panns/predict.py
@@ -16,11 +16,11 @@ import argparse
 import numpy as np
 import paddle
 import paddle.nn.functional as F
 from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets import ESC50
 from paddleaudio.features import LogMelSpectrogram
 from paddleaudio.features import melspectrogram
 from paddlespeech.cls.models import cnn14
 from paddlespeech.cls.models import SoundClassifier

--- a/paddlespeech/cls/exps/panns/train.py
+++ b/paddlespeech/cls/exps/panns/train.py
@@ -15,11 +15,11 @@ import argparse
 import os
 import paddle
 from paddleaudio.datasets import ESC50
 from paddleaudio.features import LogMelSpectrogram
 from paddleaudio.utils import logger
 from paddleaudio.utils import Timer
 from paddlespeech.cls.models import cnn14
 from paddlespeech.cls.models import SoundClassifier

--- a/paddlespeech/cls/models/panns/panns.py
+++ b/paddlespeech/cls/models/panns/panns.py
@@ -15,7 +15,6 @@ import os
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddleaudio.utils.download import load_state_dict_from_url
 from paddleaudio.utils.env import MODEL_HOME

--- a/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py
+++ b/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/s2t/exps/u2/bin/__init__.py
+++ b/paddlespeech/s2t/exps/u2/bin/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/s2t/exps/u2_st/bin/__init__.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -26,10 +26,8 @@ from paddle import distributed as dist
 from paddle.io import DataLoader
 from yacs.config import CfgNode
-from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.io.collator import TripletSpeechCollator
-from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.io.dataset import ManifestDataset
 from paddlespeech.s2t.io.sampler import SortagradBatchSampler
 from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler

--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
@@ -71,8 +71,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
    vocoder.eval()
    print("model done!")
-    frontend = English()
+    frontend = English(phone_vocab_path=args.phones_dict)
-    punc = "：，；。？！“”‘’':,;.?!"
    print("frontend done!")
    stat = np.load(args.fastspeech2_stat)
@@ -95,16 +94,8 @@ def evaluate(args, fastspeech2_config, pwg_config):
    # only test the number 0 speaker
    spk_id = 0
    for utt_id, sentence in sentences:
-        phones = frontend.phoneticize(sentence)
+        input_ids = frontend.get_input_ids(sentence)
-        # remove start_symbol and end_symbol
+        phone_ids = input_ids["phone_ids"]
-        phones = phones[1:-1]
-        phones = [phn for phn in phones if not phn.isspace()]
-        phones = [
-            phn if (phn in phone_id_map and phn not in punc) else "sp"
-            for phn in phones
-        ]
-        phone_ids = [phone_id_map[phn] for phn in phones]
-        phone_ids = paddle.to_tensor(phone_ids)
        with paddle.no_grad():
            mel = fastspeech2_inference(

--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py
@@ -63,8 +63,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
    vocoder.eval()
    print("model done!")
-    frontend = English()
+    frontend = English(phone_vocab_path=args.phones_dict)
-    punc = "：，；。？！“”‘’':,;.?!"
    print("frontend done!")
    stat = np.load(args.fastspeech2_stat)
@@ -86,16 +85,8 @@ def evaluate(args, fastspeech2_config, pwg_config):
    output_dir.mkdir(parents=True, exist_ok=True)
    for utt_id, sentence in sentences:
-        phones = frontend.phoneticize(sentence)
+        input_ids = frontend.get_input_ids(sentence)
-        # remove start_symbol and end_symbol
+        phone_ids = input_ids["phone_ids"]
-        phones = phones[1:-1]
-        phones = [phn for phn in phones if not phn.isspace()]
-        phones = [
-            phn if (phn in phone_id_map and phn not in punc) else "sp"
-            for phn in phones
-        ]
-        phone_ids = [phone_id_map[phn] for phn in phones]
-        phone_ids = paddle.to_tensor(phone_ids)
        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids)

--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -14,6 +14,7 @@
 from abc import ABC
 from abc import abstractmethod
+import paddle
 from g2p_en import G2p
 from g2pM import G2pM
@@ -45,20 +46,25 @@ class English(Phonetics):
    """ Normalize the input text sequence and convert into pronunciation id sequence.
    """
-    def __init__(self):
+    def __init__(self, phone_vocab_path=None):
+        """ Build the English frontend.
+        Parameters
+        -----------
+        phone_vocab_path: str, optional
+            Path to a text file holding one whitespace-separated
+            "<phone> <id>" pair per line. When given, the pairs are loaded
+            into `self.vocab_phones`, which `get_input_ids` reads.
+        """
        self.backend = G2p()
        self.phonemes = list(self.backend.phonemes)
        self.punctuations = get_punctuations("en")
        self.vocab = Vocab(self.phonemes + self.punctuations)
+        self.vocab_phones = {}
+        self.punc = "：，；。？！“”‘’':,;.?!"
+        if phone_vocab_path:
+            with open(phone_vocab_path, 'rt') as f:
+                for line in f:
+                    fields = line.split()
+                    # Skip blank lines (e.g. at the end of the file) instead
+                    # of failing to unpack them.
+                    if not fields:
+                        continue
+                    phn, phn_id = fields
+                    self.vocab_phones[phn] = int(phn_id)
    def phoneticize(self, sentence):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
        Parameters
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[str]
@@ -72,14 +78,27 @@ class English(Phonetics):
        phonemes = [item for item in phonemes if item in self.vocab.stoi]
        return phonemes
+    def get_input_ids(self, sentence: str) -> dict:
+        """ Convert the input text sequence into phone ids looked up in
+        `self.vocab_phones`.
+        Parameters
+        -----------
+        sentence: str
+            The input text sequence.
+        Returns
+        ----------
+        dict
+            A dict with one key, "phone_ids", holding the phone id sequence
+            as a paddle.Tensor.
+        """
+        result = {}
+        phones = self.phoneticize(sentence)
+        # remove start_symbol and end_symbol
+        phones = phones[1:-1]
+        phones = [phn for phn in phones if not phn.isspace()]
+        # Punctuation and phones missing from the vocabulary become "sp".
+        phones = [
+            phn if (phn in self.vocab_phones and phn not in self.punc) else "sp"
+            for phn in phones
+        ]
+        phone_ids = [self.vocab_phones[phn] for phn in phones]
+        phone_ids = paddle.to_tensor(phone_ids)
+        result["phone_ids"] = phone_ids
+        return result
    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
        Parameters
        -----------
        phonemes: List[str]
            The list of pronunciation sequence.
        Returns
        ----------
        List[int]
@@ -93,12 +112,10 @@ class English(Phonetics):
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
        Parameters
        -----------
        ids: List[int]
            The list of pronunciation id sequence.
        Returns
        ----------
        List[str]
@@ -108,12 +125,10 @@ class English(Phonetics):
    def __call__(self, sentence):
        """ Convert the input text sequence into pronunciation id sequence.
        Parameters
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[str]
@@ -140,12 +155,10 @@ class EnglishCharacter(Phonetics):
    def phoneticize(self, sentence):
        """ Normalize the input text sequence.
        Parameters
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        str
@@ -156,12 +169,10 @@ class EnglishCharacter(Phonetics):
    def numericalize(self, sentence):
        """ Convert a text sequence into ids.
        Parameters
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[int]
@@ -175,17 +186,14 @@ class EnglishCharacter(Phonetics):
    def reverse(self, ids):
        """ Convert a character id sequence into text.
        Parameters
        -----------
        ids: List[int]
            List of a character id sequence.
        Returns
        ----------
        str
            The input text sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
@@ -195,7 +203,6 @@ class EnglishCharacter(Phonetics):
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[int]
@@ -229,12 +236,10 @@ class Chinese(Phonetics):
    def phoneticize(self, sentence):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
        Parameters
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[str]
@@ -263,12 +268,10 @@ class Chinese(Phonetics):
    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
        Parameters
        -----------
        phonemes: List[str]
            The list of pronunciation sequence.
        Returns
        ----------
        List[int]
@@ -279,12 +282,10 @@ class Chinese(Phonetics):
    def __call__(self, sentence):
        """ Convert the input text sequence into pronunciation id sequence.
        Parameters
        -----------
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[str]
@@ -300,12 +301,10 @@ class Chinese(Phonetics):
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
        Parameters
        -----------
        ids: List[int]
            The list of pronunciation id sequence.
        Returns
        ----------
        List[str]

--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -5,7 +5,6 @@ import functools
 from pathlib import Path
 import jsonlines
 from utils.utility import add_arguments
 from utils.utility import print_arguments