PaddlePaddle / DeepSpeech

Commit f8d52e59 (unverified), authored Jun 28, 2021 by Hui Zhang and committed via GitHub on Jun 28, 2021.

Merge branch 'develop' into rsl

Parents: 03e69525, 718bd307
Showing 14 changed files with 71 additions and 47 deletions (+71, -47):
- deepspeech/exps/deepspeech2/bin/deploy/runtime.py (+16, -9)
- deepspeech/exps/deepspeech2/bin/deploy/server.py (+20, -9)
- deepspeech/exps/deepspeech2/bin/tune.py (+1, -1)
- deepspeech/exps/deepspeech2/model.py (+1, -1)
- deepspeech/exps/u2/model.py (+1, -1)
- deepspeech/io/collator.py (+2, -1)
- deepspeech/models/deepspeech2.py (+4, -4)
- deepspeech/models/u2.py (+4, -4)
- deepspeech/utils/socket_server.py (+2, -2)
- examples/aishell/s0/README.md (+7, -7)
- examples/aishell/s0/conf/deepspeech2.yaml (+1, -2)
- examples/aishell/s0/run.sh (+2, -2)
- examples/aishell/s1/README.md (+1, -0)
- examples/librispeech/s0/conf/deepspeech2.yaml (+9, -4)
deepspeech/exps/deepspeech2/bin/deploy/runtime.py

```diff
@@ -18,8 +18,10 @@ import numpy as np
 import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
+from paddle.io import DataLoader
 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser
@@ -78,26 +80,31 @@ def inference(config, args):
 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()
 
     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        audio_len = feature[0].shape[0]
         audio_len = np.array([audio_len]).astype('int64')  # [1]
 
         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,
@@ -138,7 +145,7 @@ if __name__ == "__main__":
     add_arg('host_ip', str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port', int, 8086, "Server's IP port.")
+    add_arg('host_port', int, 8089, "Server's IP port.")
     add_arg('speech_save_dir', str,
             'demo_cache',
             "Directory to save demo audios.")
```
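The substance of this change is that model construction now goes through a DataLoader whose collate_fn owns the feature pipeline. Below is a minimal sketch of the new serving-side wiring; the config path and checkpoint prefix are hypothetical placeholders, not values from this commit:

```python
from paddle.io import DataLoader

from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model

config = get_cfg_defaults()
config.defrost()
config.merge_from_file('conf/deepspeech2.yaml')  # hypothetical config path
config.data.manifest = config.data.test_manifest
config.collator.augmentation_config = ""       # no augmentation when serving
config.collator.keep_transcription_text = True
config.collator.batch_size = 1                 # one utterance per request
config.collator.num_workers = 0

dataset = ManifestDataset.from_config(config)
collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)

# from_pretrained now takes the loader and reads feature/vocab sizes
# off test_loader.collate_fn instead of the dataset.
model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                         'checkpoints/avg_1')  # hypothetical prefix
model.eval()
```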
deepspeech/exps/deepspeech2/bin/deploy/server.py

```diff
@@ -16,8 +16,10 @@ import functools
 import numpy as np
 import paddle
+from paddle.io import DataLoader
 
 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser
@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()
 
     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        # audio = audio.swapaxes(1,2)
+        print('---file_to_transcript feature----')
+        print(audio.shape)
+        audio_len = feature[0].shape[0]
+        print(audio_len)
         audio_len = np.array([audio_len]).astype('int64')  # [1]
 
         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,
@@ -91,7 +102,7 @@ if __name__ == "__main__":
     add_arg('host_ip', str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port', int, 8086, "Server's IP port.")
+    add_arg('host_port', int, 8088, "Server's IP port.")
     add_arg('speech_save_dir', str,
             'demo_cache',
             "Directory to save demo audios.")
```
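Both deploy scripts also track the collator's new feature layout: process_utterance now hands back a time-major [T, D] spectrogram (see deepspeech/io/collator.py below), so the frame count is read from axis 0 rather than axis 1. A toy numpy check of the shape contract the handler now assumes:

```python
import numpy as np

# Stand-in for what test_loader.collate_fn.process_utterance(...) returns:
# a time-major spectrogram, e.g. 200 frames of a 161-bin linear spectrogram.
feat = np.zeros((200, 161), dtype='float32')           # [T, D]

audio = np.array([feat]).astype('float32')             # [1, T, D], batch of one
audio_len = np.array([feat.shape[0]]).astype('int64')  # [1], frames are axis 0

assert audio.shape == (1, 200, 161)
assert audio_len[0] == 200
```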
deepspeech/exps/deepspeech2/bin/tune.py

```diff
@@ -47,7 +47,7 @@ def tune(config, args):
         drop_last=False,
         collate_fn=SpeechCollator(keep_transcription_text=True))
-    model = DeepSpeech2Model.from_pretrained(dev_dataset, config,
+    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
                                              args.checkpoint_path)
     model.eval()
```
deepspeech/exps/deepspeech2/model.py

```diff
@@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def export(self):
         infer_model = DeepSpeech2InferModel.from_pretrained(
-            self.test_loader.dataset, self.config, self.args.checkpoint_path)
+            self.test_loader, self.config, self.args.checkpoint_path)
         infer_model.eval()
         feat_dim = self.test_loader.collate_fn.feature_size
         static_model = paddle.jit.to_static(
```
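Export follows the same rule: the inference model is built from the loader, and the feature dimension comes from its collator. The sketch below shows the paddle.jit.to_static pattern around the changed line; the stand-in layer and the input-spec shapes are assumptions for illustration, not this repo's exact spec (the diff is truncated before it):

```python
import paddle
from paddle.static import InputSpec


class TinyInferModel(paddle.nn.Layer):
    """Stand-in for DeepSpeech2InferModel, only to make the sketch runnable."""

    def __init__(self, feat_dim):
        super().__init__()
        self.proj = paddle.nn.Linear(feat_dim, 8)

    def forward(self, audio, audio_len):
        return self.proj(audio), audio_len


feat_dim = 161  # stands in for self.test_loader.collate_fn.feature_size
infer_model = TinyInferModel(feat_dim)
infer_model.eval()

static_model = paddle.jit.to_static(
    infer_model,
    input_spec=[
        InputSpec(shape=[None, None, feat_dim], dtype='float32'),  # audio [B, T, D]
        InputSpec(shape=[None], dtype='int64'),                    # audio_len [B]
    ])
paddle.jit.save(static_model, 'checkpoints/avg_1.jit')  # hypothetical export prefix
```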
deepspeech/exps/u2/model.py

```diff
@@ -506,7 +506,7 @@ class U2Tester(U2Trainer):
             List[paddle.static.InputSpec]: input spec.
         """
         from deepspeech.models.u2 import U2InferModel
-        infer_model = U2InferModel.from_pretrained(self.test_loader.dataset,
+        infer_model = U2InferModel.from_pretrained(self.test_loader,
                                                    self.config.model.clone(),
                                                    self.args.checkpoint_path)
         feat_dim = self.test_loader.collate_fn.feature_size
```
deepspeech/io/collator.py

```diff
@@ -242,6 +242,7 @@ class SpeechCollator():
         # specgram augment
         specgram = self._augmentation_pipeline.transform_feature(specgram)
+        specgram = specgram.transpose([1, 0])
         return specgram, transcript_part
 
     def __call__(self, batch):
@@ -269,7 +270,7 @@ class SpeechCollator():
             #utt
             utts.append(utt)
             # audio
-            audios.append(audio.T)  # [T, D]
+            audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[1])
             # text
             # for training, text is token ids
```
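The single functional line added here transposes each utterance's spectrogram from [D, T] (feature bins by frames) to [T, D] before it is returned, which is why __call__ can now append audio directly instead of audio.T. A toy check of the axis swap:

```python
import numpy as np

# The augmentation pipeline works on [D, T] (feature bins x frames);
# the new transpose hands [T, D] (frames x feature bins) to the batcher.
specgram = np.zeros((161, 200), dtype='float32')  # [D, T]
specgram = specgram.transpose([1, 0])             # now [T, D]
assert specgram.shape == (200, 161)
```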
deepspeech/models/deepspeech2.py

```diff
@@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer):
             cutoff_top_n, num_processes)
 
     @classmethod
-    def from_pretrained(cls, dataset, config, checkpoint_path):
+    def from_pretrained(cls, dataloader, config, checkpoint_path):
         """Build a DeepSpeech2Model model from a pretrained model.
 
         Parameters
         ----------
-        dataset: paddle.io.Dataset
+        dataloader: paddle.io.DataLoader
         config: yacs.config.CfgNode
             model configs
@@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer):
         DeepSpeech2Model
             The model built from pretrained result.
         """
-        model = cls(feat_size=dataset.feature_size,
-                    dict_size=dataset.vocab_size,
+        model = cls(feat_size=dataloader.collate_fn.feature_size,
+                    dict_size=dataloader.collate_fn.vocab_size,
                     num_conv_layers=config.model.num_conv_layers,
                     num_rnn_layers=config.model.num_rnn_layers,
                     rnn_size=config.model.rnn_layer_size,
```
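With this signature, from_pretrained touches nothing on the loader except collate_fn.feature_size and collate_fn.vocab_size (plus the config and checkpoint), so any object exposing those two attributes through a collate_fn satisfies the new contract. A duck-typing sketch, all names hypothetical:

```python
# Hypothetical stand-ins showing the only attributes from_pretrained
# reads off its new dataloader argument.
class FakeCollator:
    feature_size = 161   # becomes feat_size of the model
    vocab_size = 4233    # becomes dict_size of the model


class FakeLoader:
    collate_fn = FakeCollator()


loader = FakeLoader()
# DeepSpeech2Model.from_pretrained(loader, config, checkpoint_path) builds:
#   cls(feat_size=loader.collate_fn.feature_size,
#       dict_size=loader.collate_fn.vocab_size, ...)
print(loader.collate_fn.feature_size, loader.collate_fn.vocab_size)
```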
deepspeech/models/u2.py

```diff
@@ -876,11 +876,11 @@ class U2Model(U2BaseModel):
         return model
 
     @classmethod
-    def from_pretrained(cls, dataset, config, checkpoint_path):
+    def from_pretrained(cls, dataloader, config, checkpoint_path):
         """Build a DeepSpeech2Model model from a pretrained model.
 
         Args:
-            dataset (paddle.io.Dataset): not used.
+            dataloader (paddle.io.DataLoader): not used.
             config (yacs.config.CfgNode): model configs
             checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name
@@ -888,8 +888,8 @@ class U2Model(U2BaseModel):
             DeepSpeech2Model: The model built from pretrained result.
         """
         config.defrost()
-        config.input_dim = dataset.feature_size
-        config.output_dim = dataset.vocab_size
+        config.input_dim = dataloader.collate_fn.feature_size
+        config.output_dim = dataloader.collate_fn.vocab_size
         config.freeze()
         model = cls.from_config(config)
```
deepspeech/utils/socket_server.py

```diff
@@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler,
     rng = random.Random(random_seed)
     samples = rng.sample(manifest, num_test_cases)
     for idx, sample in enumerate(samples):
-        print("Warm-up Test Case %d: %s", idx, sample['audio_filepath'])
+        print("Warm-up Test Case %d: %s" % (idx, sample['feat']))
         start_time = time.time()
-        transcript = audio_process_handler(sample['audio_filepath'])
+        transcript = audio_process_handler(sample['feat'])
         finish_time = time.time()
         print("Response Time: %f, Transcript: %s" %
               (finish_time - start_time, transcript))
```
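The first print was passing idx and the path as extra positional arguments; unlike the logging module, print does no %-interpolation and simply writes the raw format string followed by its arguments. The fix applies the % operator explicitly (and switches to the 'feat' manifest key). A quick illustration of the difference:

```python
sample = {'feat': 'data/demo.wav'}  # hypothetical manifest entry
idx = 0

# Old form: prints the format string verbatim, then the arguments.
print("Warm-up Test Case %d: %s", idx, sample['feat'])
# -> Warm-up Test Case %d: %s 0 data/demo.wav

# New form: interpolates before printing.
print("Warm-up Test Case %d: %s" % (idx, sample['feat']))
# -> Warm-up Test Case 0: data/demo.wav
```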
examples/aishell/s0/README.md

```diff
@@ -2,10 +2,10 @@
 ## Deepspeech2
-| Model | release | Config | Test set | Loss | CER |
-| --- | --- | --- | --- | --- | --- |
-| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 ~ 0.073507 |
-| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
-| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
-| DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
-| DeepSpeech2 58.4M | 1.8.5 | - | test | - | 0.080447 |
+| Model | Params | Release | Config | Test set | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- |
+| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382,0.073507 |
+| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
+| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
+| DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
+| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 |
```
examples/aishell/s0/conf/deepspeech2.yaml

```diff
@@ -10,8 +10,8 @@ data:
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
 
 collator:
+  batch_size: 64  # one gpu
   mean_std_filepath: data/mean_std.json
   unit_type: char
   vocab_filepath: data/vocab.txt
@@ -33,7 +33,6 @@ collator:
   sortagrad: True
   shuffle_method: batch_shuffle
   num_workers: 0
-  batch_size: 64  # one gpu
 
 model:
   num_conv_layers: 2
```
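Moving batch_size and num_workers under collator: changes where downstream code must read them; with yacs, the YAML block name becomes the attribute path. A small sketch against a trimmed stand-in config (the real file has many more keys):

```python
from yacs.config import CfgNode

# Trimmed stand-in for the restructured aishell config.
cfg = CfgNode.load_cfg("""
data:
  test_manifest: data/manifest.test
collator:
  batch_size: 64
  num_workers: 0
""")

# Batching options now live under config.collator, not config.data.
assert cfg.collator.batch_size == 64
assert cfg.collator.num_workers == 0
assert 'batch_size' not in cfg.data
```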
examples/aishell/s0/run.sh

```diff
@@ -31,10 +31,10 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
```
examples/aishell/s1/README.md

```diff
@@ -9,6 +9,7 @@
 | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
 | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
 
+
 ## Chunk Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
```
examples/librispeech/s0/conf/deepspeech2.yaml

```diff
@@ -3,16 +3,21 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev-clean
   test_manifest: data/manifest.test-clean
-  mean_std_filepath: data/mean_std.json
-  vocab_filepath: data/vocab.txt
-  augmentation_config: conf/augmentation.json
-  batch_size: 20
   min_input_len: 0.0
   max_input_len: 27.0 # second
   min_output_len: 0.0
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
+
+collator:
+  batch_size: 20
+  mean_std_filepath: data/mean_std.json
+  unit_type: char
+  vocab_filepath: data/vocab.txt
+  augmentation_config: conf/augmentation.json
+  random_seed: 0
+  spm_model_prefix:
   specgram_type: linear
   target_sample_rate: 16000
   max_freq: None
```
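The librispeech config gets the same split, and the new collator: block also carries the collator-only keys (unit_type, spm_model_prefix, random_seed) that the collator factory consumes. A sketch of loading the restructured file, assuming get_cfg_defaults matches it (exactly which keys from_config reads is not shown in this diff):

```python
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator

config = get_cfg_defaults()
config.defrost()
config.merge_from_file('examples/librispeech/s0/conf/deepspeech2.yaml')
config.freeze()

# The collator is built from the collator: block (mean_std_filepath,
# vocab_filepath, augmentation_config, specgram_type, ...), not from data:.
collate_fn = SpeechCollator.from_config(config)
print(collate_fn.feature_size, collate_fn.vocab_size)
```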