rename chunk to block in streaming tts, test=tts

004ab8d0 · 小湉湉 · 0fa32e4a · 004ab8d0 · 004ab8d0 · 004ab8d0
4 changed files
--- a/paddlespeech/t2s/exps/inference_streaming.py
+++ b/paddlespeech/t2s/exps/inference_streaming.py
@@ -90,7 +90,7 @@ def parse_args():
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
-        "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")
@@ -169,7 +169,7 @@ def main():
    N = 0
    T = 0
-    chunk_size = args.chunk_size
+    block_size = args.block_size
    pad_size = args.pad_size
    get_tone_ids = False
    for utt_id, sentence in sentences:
@@ -189,7 +189,7 @@ def main():
                am_encoder_infer_predictor, input=phones)
            if args.am_streaming:
-                hss = get_chunks(orig_hs, chunk_size, pad_size)
+                hss = get_chunks(orig_hs, block_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
@@ -211,7 +211,7 @@ def main():
                        sub_mel = sub_mel[pad_size:]
                    else:
                        # 倒数几块的右侧也可能没有 pad 够
-                        sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                          sub_mel.shape[0]]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)

--- a/paddlespeech/t2s/exps/ort_predict_streaming.py
+++ b/paddlespeech/t2s/exps/ort_predict_streaming.py
@@ -97,7 +97,7 @@ def ort_predict(args):
    T = 0
    merge_sentences = True
    get_tone_ids = False
-    chunk_size = args.chunk_size
+    block_size = args.block_size
    pad_size = args.pad_size
    for utt_id, sentence in sentences:
@@ -115,7 +115,7 @@ def ort_predict(args):
            orig_hs = am_encoder_infer_sess.run(
                None, input_feed={'text': phone_ids})
            if args.am_streaming:
-                hss = get_chunks(orig_hs[0], chunk_size, pad_size)
+                hss = get_chunks(orig_hs[0], block_size, pad_size)
                chunk_num = len(hss)
                mel_list = []
                for i, hs in enumerate(hss):
@@ -139,7 +139,7 @@ def ort_predict(args):
                        sub_mel = sub_mel[pad_size:]
                    else:
                        # 倒数几块的右侧也可能没有 pad 够
-                        sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                          sub_mel.shape[0]]
                    mel_list.append(sub_mel)
                mel = np.concatenate(mel_list, axis=0)
@@ -236,7 +236,7 @@ def parse_args():
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
-        "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")

--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -75,13 +75,13 @@ def denorm(data, mean, std):
    return data * std + mean
-def get_chunks(data, chunk_size: int, pad_size: int):
+def get_chunks(data, block_size: int, pad_size: int):
    data_len = data.shape[1]
    chunks = []
-    n = math.ceil(data_len / chunk_size)
+    n = math.ceil(data_len / block_size)
    for i in range(n):
-        start = max(0, i * chunk_size - pad_size)
+        start = max(0, i * block_size - pad_size)
-        end = min((i + 1) * chunk_size + pad_size, data_len)
+        end = min((i + 1) * block_size + pad_size, data_len)
        chunks.append(data[:, start:end, :])
    return chunks

--- a/paddlespeech/t2s/exps/synthesize_streaming.py
+++ b/paddlespeech/t2s/exps/synthesize_streaming.py
@@ -133,7 +133,7 @@ def evaluate(args):
    N = 0
    T = 0
-    chunk_size = args.chunk_size
+    block_size = args.block_size
    pad_size = args.pad_size
    for utt_id, sentence in sentences:
@@ -153,7 +153,7 @@ def evaluate(args):
                # acoustic model
                orig_hs = am_encoder_infer(phone_ids)
                if args.am_streaming:
-                    hss = get_chunks(orig_hs, chunk_size, pad_size)
+                    hss = get_chunks(orig_hs, block_size, pad_size)
                    chunk_num = len(hss)
                    mel_list = []
                    for i, hs in enumerate(hss):
@@ -171,7 +171,7 @@ def evaluate(args):
                            sub_mel = sub_mel[pad_size:]
                        else:
                            # 倒数几块的右侧也可能没有 pad 够
-                            sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+                            sub_mel = sub_mel[pad_size:(block_size + pad_size) -
                                              sub_mel.shape[0]]
                        mel_list.append(sub_mel)
                    mel = paddle.concat(mel_list, axis=0)
@@ -277,7 +277,7 @@ def parse_args():
        default=False,
        help="whether use streaming acoustic model")
    parser.add_argument(
-        "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+        "--block_size", type=int, default=42, help="block size of am streaming")
    parser.add_argument(
        "--pad_size", type=int, default=12, help="pad size of am streaming")