Fix configs for the new data pipeline (move collator options into a `collator` section); update the U2 export InputSpec

9b3acddd · Hui Zhang · d1c280cc · 9b3acddd · 9b3acddd · 9b3acddd
5 changed files
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -511,10 +511,9 @@ class U2Tester(U2Trainer):
                                                   self.args.checkpoint_path)
        feat_dim = self.test_loader.collate_fn.feature_size
        input_spec = [
-            paddle.static.InputSpec(
-                shape=[None, feat_dim, None],
-                dtype='float32'),  # audio, [B,D,T]
-            paddle.static.InputSpec(shape=[None],
+            paddle.static.InputSpec(shape=[1, None, feat_dim],
+                                    dtype='float32'),  # audio, [B,T,D]
+            paddle.static.InputSpec(shape=[1],
                                    dtype='int64'),  # audio_length, [B]
        ]
        return infer_model, input_spec

--- a/examples/librispeech/s1/conf/chunk_confermer.yaml
+++ b/examples/librispeech/s1/conf/chunk_confermer.yaml
@@ -3,18 +3,20 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
-  vocab_filepath: data/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
-  batch_size: 4
  min_input_len: 0.5
  max_input_len: 20.0
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
+
+collator:
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
@@ -80,7 +82,7 @@ model:

 training:
  n_epoch: 120
-  accum_grad: 1
+  accum_grad: 8
  global_grad_clip: 5.0
  optim: adam
  optim_conf:

--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -3,18 +3,20 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
-  vocab_filepath: data/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
-  batch_size: 64
  min_input_len: 0.5  # second
  max_input_len: 20.0 # second
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
+
+collator:
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
@@ -103,6 +105,6 @@ decoding:
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+  simulate_streaming: true  # simulate streaming inference. Defaults to False.


--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -3,18 +3,20 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test-clean
-  vocab_filepath: data/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
-  batch_size: 16
  min_input_len: 0.5  # seconds
  max_input_len: 20.0 # seconds
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
+
+collator:
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80

--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -3,18 +3,20 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test-clean
-  vocab_filepath: data/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
-  batch_size: 64
  min_input_len: 0.5  # second
  max_input_len: 20.0 # second
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
+
+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_5000'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80