diff --git a/deepspeech/io/collator_st.py b/deepspeech/io/collator_st.py index 34933312148d29bd4e111ee8494e7f32f23dc89b..1be6445d703e86c0af27002d77a89685e3423e28 100644 --- a/deepspeech/io/collator_st.py +++ b/deepspeech/io/collator_st.py @@ -563,7 +563,7 @@ class KaldiPrePorocessedCollator(SpeechCollator): @property def feature_size(self): return self._feat_dim - + @property def stride_ms(self): return self._stride_ms diff --git a/deepspeech/utils/bleu_score.py b/deepspeech/utils/bleu_score.py index 580fbf6124d54eb10bf0dae2ffc1c926e9619b2a..f1bf5261ee3ad67759234a1a87efd9b657324e51 100644 --- a/deepspeech/utils/bleu_score.py +++ b/deepspeech/utils/bleu_score.py @@ -35,6 +35,7 @@ def bleu(hypothesis, reference): return sacrebleu.corpus_bleu(hypothesis, reference) + def char_bleu(hypothesis, reference): """Calculate BLEU. BLEU compares reference text and hypothesis text in char-level using scarebleu. @@ -47,7 +48,8 @@ def char_bleu(hypothesis, reference): :type hypothesis: list[str] :raises ValueError: If the reference number is zero. """ - hypothesis =[' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis] - reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref ]for ref in reference ] + hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis] + reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref] + for ref in reference] - return sacrebleu.corpus_bleu(hypothesis, reference) \ No newline at end of file + return sacrebleu.corpus_bleu(hypothesis, reference) diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 08f15119e7a5e20bc5508c56c55f2eff85e17cae..14bef01d2b129a04dc0aac21765321893c470ac8 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -44,9 +44,11 @@ def create_manifest(data_dir, manifest_path_prefix): print("Creating manifest %s ..." % manifest_path_prefix) json_lines = [] - data_types_infos = [('train', 'train-split/train-segment', 'En-Zh/train.en-zh'), - ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'), - ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')] + data_types_infos = [ + ('train', 'train-split/train-segment', 'En-Zh/train.en-zh'), + ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'), + ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh') + ] for data_info in data_types_infos: dtype, audio_relative_dir, text_relative_path = data_info del json_lines[:] @@ -63,7 +65,7 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_id, trancription, translation = line.split('\t') utt = audio_id.split('.')[0] - + audio_path = os.path.join(audio_dir, audio_id) if os.path.exists(audio_path): if os.path.getsize(audio_path) < 30000: diff --git a/examples/ted_en_zh/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/conf/transformer_joint_noam.yaml index ba384f8cda816e6f56f670b4448e7d1c74bb5d51..bc1f8890d9d784053ce1ed84c9cbb8675f39782b 100644 --- a/examples/ted_en_zh/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/conf/transformer_joint_noam.yaml @@ -3,8 +3,8 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - min_input_len: 0.5 # second - max_input_len: 3000.0 # second + min_input_len: 0.05 # second + max_input_len: 30.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.01