Merge branch 'develop' of github.com:SmileGoat/PaddleSpeech into refactor_file_struct

4b81cd0f · Yang Zhou · 156ccfe4 · 0d34c624 · 4b81cd0f · 4b81cd0f
7 changed files
--- a/docker/ubuntu18-cpu/Dockerfile
+++ b/docker/ubuntu18-cpu/Dockerfile
+# CPU image that clones PaddleSpeech and installs it from locally built wheels.
+FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
+LABEL maintainer="paddlesl@baidu.com"
+
+RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech
+# `exit 0` keeps the build going when the uninstall step fails.
+RUN pip3 uninstall mccabe -y ; exit 0;
+RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4
+
+# Each RUN executes in its own shell, so a bare `RUN cd ...` does not affect
+# later instructions. WORKDIR is what persists the directory between steps.
+WORKDIR /home/PaddleSpeech/audio
+RUN python setup.py bdist_wheel
+
+# Also the working directory of the final image; the wheel globs below are
+# relative to it.
+WORKDIR /home/PaddleSpeech
+RUN python setup.py bdist_wheel
+RUN pip install audio/dist/*.whl dist/*.whl
--- a/paddlespeech/resource/resource.py
+++ b/paddlespeech/resource/resource.py
@@ -82,7 +82,7 @@ class CommonTaskResource:
            self.model_tag = model_tag
            self.version = version
            self.res_dict = self.pretrained_models[model_tag][version]
-            self.format_path(self.res_dict)
+            self._format_path(self.res_dict)
            self.res_dir = self._fetch(self.res_dict,
                                       self._get_model_dir(model_type))
        else:
@@ -90,19 +90,10 @@ class CommonTaskResource:
            self.voc_model_tag = model_tag
            self.voc_version = version
            self.voc_res_dict = self.pretrained_models[model_tag][version]
-            self.format_path(self.voc_res_dict)
+            self._format_path(self.voc_res_dict)
            self.voc_res_dir = self._fetch(self.voc_res_dict,
                                           self._get_model_dir(model_type))

-    @staticmethod
-    def format_path(res_dict: Dict[str, str]):
-        for k, v in res_dict.items():
-            if '/' in v:
-                if v.startswith('https://') or v.startswith('http://'):
-                    continue
-                else:
-                    res_dict[k] = os.path.join(*(v.split('/')))
-
    @staticmethod
    def get_model_class(model_name) -> List[object]:
        """Dynamic import model class.
@@ -231,3 +222,12 @@ class CommonTaskResource:
            os.PathLike: Directory of model resource.
        """
        return download_and_decompress(res_dict, target_dir)
+
+    @staticmethod
+    def _format_path(res_dict: Dict[str, str]):
+        for k, v in res_dict.items():
+            if isinstance(v, str) and '/' in v:
+                if v.startswith('https://') or v.startswith('http://'):
+                    continue
+                else:
+                    res_dict[k] = os.path.join(*(v.split('/')))
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@@ -44,7 +44,7 @@ class TTSServerExecutor(TTSExecutor):
        self.voc_block = voc_block
        self.voc_pad = voc_pad
        self.task_resource = CommonTaskResource(
-            task='tts', model_format='static', inference_mode='online')
+            task='tts', model_format='dynamic', inference_mode='online')

    def get_model_info(self,
                       field: str,

--- a/paddlespeech/t2s/exps/inference_streaming.py
+++ b/paddlespeech/t2s/exps/inference_streaming.py
@@ -90,7 +90,7 @@ def parse_args():
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
-        "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")

@@ -169,7 +169,7 @@ def main():

    N = 0
    T = 0
-    chunk_size = args.chunk_size
+    block_size = args.block_size
    pad_size = args.pad_size
    get_tone_ids = False
    for utt_id, sentence in sentences:
@@ -189,7 +189,7 @@ def main():
                am_encoder_infer_predictor, input=phones)

            if args.am_streaming:
-                hss = get_chunks(orig_hs, chunk_size, pad_size)
+                hss = get_chunks(orig_hs, block_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
@@ -211,7 +211,7 @@ def main():
                        sub_mel = sub_mel[pad_size:]
                    else:
                        # 倒数几块的右侧也可能没有 pad 够
-                        sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                          sub_mel.shape[0]]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)

--- a/paddlespeech/t2s/exps/ort_predict_streaming.py
+++ b/paddlespeech/t2s/exps/ort_predict_streaming.py
@@ -97,7 +97,7 @@ def ort_predict(args):
    T = 0
    merge_sentences = True
    get_tone_ids = False
-    chunk_size = args.chunk_size
+    block_size = args.block_size
    pad_size = args.pad_size

    for utt_id, sentence in sentences:
@@ -115,7 +115,7 @@ def ort_predict(args):
            orig_hs = am_encoder_infer_sess.run(
                None, input_feed={'text': phone_ids})
            if args.am_streaming:
-                hss = get_chunks(orig_hs[0], chunk_size, pad_size)
+                hss = get_chunks(orig_hs[0], block_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
@@ -139,7 +139,7 @@ def ort_predict(args):
                        sub_mel = sub_mel[pad_size:]
                    else:
                        # 倒数几块的右侧也可能没有 pad 够
-                        sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                          sub_mel.shape[0]]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)
@@ -236,7 +236,7 @@ def parse_args():
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
-        "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")


--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -75,13 +75,13 @@ def denorm(data, mean, std):
    return data * std + mean


-def get_chunks(data, chunk_size: int, pad_size: int):
+def get_chunks(data, block_size: int, pad_size: int):
    data_len = data.shape[1]
    chunks = []
-    n = math.ceil(data_len / chunk_size)
+    n = math.ceil(data_len / block_size)
    for i in range(n):
-        start = max(0, i * chunk_size - pad_size)
-        end = min((i + 1) * chunk_size + pad_size, data_len)
+        start = max(0, i * block_size - pad_size)
+        end = min((i + 1) * block_size + pad_size, data_len)
        chunks.append(data[:, start:end, :])
    return chunks


--- a/paddlespeech/t2s/exps/synthesize_streaming.py
+++ b/paddlespeech/t2s/exps/synthesize_streaming.py
@@ -133,7 +133,7 @@ def evaluate(args):

    N = 0
    T = 0
-    chunk_size = args.chunk_size
+    block_size = args.block_size
    pad_size = args.pad_size

    for utt_id, sentence in sentences:
@@ -153,7 +153,7 @@ def evaluate(args):
                # acoustic model
                orig_hs = am_encoder_infer(phone_ids)
                if args.am_streaming:
-                    hss = get_chunks(orig_hs, chunk_size, pad_size)
+                    hss = get_chunks(orig_hs, block_size, pad_size)
                    chunk_num = len(hss)
                    mel_list = []
                    for i, hs in enumerate(hss):
@@ -171,7 +171,7 @@ def evaluate(args):
                            sub_mel = sub_mel[pad_size:]
                        else:
                            # 倒数几块的右侧也可能没有 pad 够
-                            sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+                            sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                              sub_mel.shape[0]]
                        mel_list.append(sub_mel)
                    mel = paddle.concat(mel_list, axis=0)
@@ -277,7 +277,7 @@ def parse_args():
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
-        "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")