Merge pull request #833 from PaddlePaddle/fix_varbase

VarBase getitem supports np.longlong since Paddle 2.2.0RC

Merge pull request #833 from PaddlePaddle/fix_varbase
VarBase getitem supports np.longlong since Paddle 2.2.0RC
98c0d43a · Hui Zhang · GitHub · 6f7a6dc2 · 282914f4 · 98c0d43a
4 changed files
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -587,26 +587,25 @@ class U2Tester(U2Trainer):
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
-                logger.info("align ids", key[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))
                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
-                logger.info("align tokens", key[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
-                align_output_path = os.path.join(
+                align_output_path = Path(self.args.result_file).parent / "align"
-                    os.path.dirname(self.args.result_file), "align")
+                align_output_path.mkdir(parents=True, exist_ok=True)
-                tier_path = os.path.join(align_output_path, key[0] + ".tier")
+                tier_path = align_output_path / (key[0] + ".tier")
-                with open(tier_path, 'w') as f:
+                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
-                textgrid_path = os.path.join(align_output_path,
+                textgrid_path = align_output_path / (key[0] + ".TextGrid")
-                                             key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
@@ -614,7 +613,7 @@ class U2Tester(U2Trainer):
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
-                    output=textgrid_path)
+                    output=str(textgrid_path))
    def run_align(self):
        self.resume_or_scratch()

--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@@ -546,9 +546,8 @@ class U2Tester(U2Trainer):
        self.model.eval()
        logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
-        stride_ms = self.config.collater.stride_ms
+        stride_ms = self.align_loader.collate_fn.stride_ms
-        token_dict = self.args.char_list
+        token_dict = self.align_loader.collate_fn.vocab_list
        with open(self.args.result_file, 'w') as fout:
            # one example in batch
            for i, batch in enumerate(self.align_loader):
@@ -565,26 +564,25 @@ class U2Tester(U2Trainer):
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
-                logger.info("align ids", key[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))
                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
-                logger.info("align tokens", key[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
-                align_output_path = os.path.join(
+                align_output_path = Path(self.args.result_file).parent / "align"
-                    os.path.dirname(self.args.result_file), "align")
+                align_output_path.mkdir(parents=True, exist_ok=True)
-                tier_path = os.path.join(align_output_path, key[0] + ".tier")
+                tier_path = align_output_path / (key[0] + ".tier")
-                with open(tier_path, 'w') as f:
+                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
-                textgrid_path = os.path.join(align_output_path,
+                textgrid_path = align_output_path / (key[0] + ".TextGrid")
-                                             key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
@@ -592,7 +590,7 @@ class U2Tester(U2Trainer):
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
-                    output=textgrid_path)
+                    output=str(textgrid_path))
    def run_align(self):
        self.resume_or_scratch()

--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -596,26 +596,25 @@ class U2STTester(U2STTrainer):
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
-                logger.info("align ids", key[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))
                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
-                logger.info("align tokens", key[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
-                align_output_path = os.path.join(
+                align_output_path = Path(self.args.result_file).parent / "align"
-                    os.path.dirname(self.args.result_file), "align")
+                align_output_path.mkdir(parents=True, exist_ok=True)
-                tier_path = os.path.join(align_output_path, key[0] + ".tier")
+                tier_path = align_output_path / (key[0] + ".tier")
-                with open(tier_path, 'w') as f:
+                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
-                textgrid_path = os.path.join(align_output_path,
+                textgrid_path = align_output_path / (key[0] + ".TextGrid")
-                                             key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
@@ -623,7 +622,7 @@ class U2STTester(U2STTrainer):
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
-                    output=textgrid_path)
+                    output=str(textgrid_path))
    def run_align(self):
        self.resume_or_scratch()

--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@@ -86,15 +86,15 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
    log_alpha = paddle.zeros(
        (ctc_probs.shape[0], len(y_insert_blank)))  #(T, 2L+1)
    log_alpha = log_alpha - float('inf')  # log of zero
-    # TODO(Hui Zhang): zeros not support paddle.int16
+    # self.__setitem_varbase__(item, value): when assigning a value to a paddle.Tensor, the tensor's data type cannot be int16
    state_path = (paddle.zeros(
        (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
                  )  # state path, Tuple((T, 2L+1))
    # init start state
-    # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
+    log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]]  # State-b, Sb
-    log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])]  # State-b, Sb
+    log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]]  # State-nb, Snb
-    log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])]  # State-nb, Snb
    for t in range(1, ctc_probs.shape[0]):  # T
        for s in range(len(y_insert_blank)):  # 2L+1
@@ -110,12 +110,10 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                    log_alpha[t - 1, s - 2],
                ])
                prev_state = [s, s - 1, s - 2]
-            # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
+            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
-            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int(
+                y_insert_blank[s]]
-                y_insert_blank[s])]
            state_path[t, s] = prev_state[paddle.argmax(candidates)]
+    # self.__setitem_varbase__(item, value): when assigning a value to a paddle.Tensor, the tensor's data type cannot be int16
-    # TODO(Hui Zhang): zeros not support paddle.int16
    state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)
    candidates = paddle.to_tensor([