提交 7ec623f7 编写于 作者: H Hui Zhang

Merge branch 'develop' into align

...@@ -18,8 +18,10 @@ import numpy as np ...@@ -18,8 +18,10 @@ import numpy as np
import paddle import paddle
from paddle.inference import Config from paddle.inference import Config
from paddle.inference import create_predictor from paddle.inference import create_predictor
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
...@@ -78,26 +80,31 @@ def inference(config, args): ...@@ -78,26 +80,31 @@ def inference(config, args):
def start_server(config, args): def start_server(config, args):
"""Start the ASR server""" """Start the ASR server"""
config.defrost() config.defrost()
config.data.manfiest = config.data.test_manifest config.data.manifest = config.data.test_manifest
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dataset = ManifestDataset.from_config(config) dataset = ManifestDataset.from_config(config)
model = DeepSpeech2Model.from_pretrained(dataset, config, config.collator.augmentation_config = ""
config.collator.keep_transcription_text = True
config.collator.batch_size = 1
config.collator.num_workers = 0
collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
model = DeepSpeech2Model.from_pretrained(test_loader, config,
args.checkpoint_path) args.checkpoint_path)
model.eval() model.eval()
# prepare ASR inference handler # prepare ASR inference handler
def file_to_transcript(filename): def file_to_transcript(filename):
feature = dataset.process_utterance(filename, "") feature = test_loader.collate_fn.process_utterance(filename, "")
audio = np.array([feature[0]]).astype('float32') #[1, D, T] audio = np.array([feature[0]]).astype('float32') #[1, T, D]
audio_len = feature[0].shape[1] audio_len = feature[0].shape[0]
audio_len = np.array([audio_len]).astype('int64') # [1] audio_len = np.array([audio_len]).astype('int64') # [1]
result_transcript = model.decode( result_transcript = model.decode(
paddle.to_tensor(audio), paddle.to_tensor(audio),
paddle.to_tensor(audio_len), paddle.to_tensor(audio_len),
vocab_list=dataset.vocab_list, vocab_list=test_loader.collate_fn.vocab_list,
decoding_method=config.decoding.decoding_method, decoding_method=config.decoding.decoding_method,
lang_model_path=config.decoding.lang_model_path, lang_model_path=config.decoding.lang_model_path,
beam_alpha=config.decoding.alpha, beam_alpha=config.decoding.alpha,
...@@ -138,7 +145,7 @@ if __name__ == "__main__": ...@@ -138,7 +145,7 @@ if __name__ == "__main__":
add_arg('host_ip', str, add_arg('host_ip', str,
'localhost', 'localhost',
"Server's IP address.") "Server's IP address.")
add_arg('host_port', int, 8086, "Server's IP port.") add_arg('host_port', int, 8089, "Server's IP port.")
add_arg('speech_save_dir', str, add_arg('speech_save_dir', str,
'demo_cache', 'demo_cache',
"Directory to save demo audios.") "Directory to save demo audios.")
......
...@@ -16,8 +16,10 @@ import functools ...@@ -16,8 +16,10 @@ import functools
import numpy as np import numpy as np
import paddle import paddle
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
...@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments ...@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
def start_server(config, args): def start_server(config, args):
"""Start the ASR server""" """Start the ASR server"""
config.defrost() config.defrost()
config.data.manfiest = config.data.test_manifest config.data.manifest = config.data.test_manifest
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dataset = ManifestDataset.from_config(config) dataset = ManifestDataset.from_config(config)
model = DeepSpeech2Model.from_pretrained(dataset, config, config.collator.augmentation_config = ""
config.collator.keep_transcription_text = True
config.collator.batch_size = 1
config.collator.num_workers = 0
collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
model = DeepSpeech2Model.from_pretrained(test_loader, config,
args.checkpoint_path) args.checkpoint_path)
model.eval() model.eval()
# prepare ASR inference handler # prepare ASR inference handler
def file_to_transcript(filename): def file_to_transcript(filename):
feature = dataset.process_utterance(filename, "") feature = test_loader.collate_fn.process_utterance(filename, "")
audio = np.array([feature[0]]).astype('float32') #[1, D, T] audio = np.array([feature[0]]).astype('float32') #[1, T, D]
audio_len = feature[0].shape[1] # audio = audio.swapaxes(1,2)
print('---file_to_transcript feature----')
print(audio.shape)
audio_len = feature[0].shape[0]
print(audio_len)
audio_len = np.array([audio_len]).astype('int64') # [1] audio_len = np.array([audio_len]).astype('int64') # [1]
result_transcript = model.decode( result_transcript = model.decode(
paddle.to_tensor(audio), paddle.to_tensor(audio),
paddle.to_tensor(audio_len), paddle.to_tensor(audio_len),
vocab_list=dataset.vocab_list, vocab_list=test_loader.collate_fn.vocab_list,
decoding_method=config.decoding.decoding_method, decoding_method=config.decoding.decoding_method,
lang_model_path=config.decoding.lang_model_path, lang_model_path=config.decoding.lang_model_path,
beam_alpha=config.decoding.alpha, beam_alpha=config.decoding.alpha,
...@@ -91,7 +102,7 @@ if __name__ == "__main__": ...@@ -91,7 +102,7 @@ if __name__ == "__main__":
add_arg('host_ip', str, add_arg('host_ip', str,
'localhost', 'localhost',
"Server's IP address.") "Server's IP address.")
add_arg('host_port', int, 8086, "Server's IP port.") add_arg('host_port', int, 8088, "Server's IP port.")
add_arg('speech_save_dir', str, add_arg('speech_save_dir', str,
'demo_cache', 'demo_cache',
"Directory to save demo audios.") "Directory to save demo audios.")
......
...@@ -47,7 +47,7 @@ def tune(config, args): ...@@ -47,7 +47,7 @@ def tune(config, args):
drop_last=False, drop_last=False,
collate_fn=SpeechCollator(keep_transcription_text=True)) collate_fn=SpeechCollator(keep_transcription_text=True))
model = DeepSpeech2Model.from_pretrained(dev_dataset, config, model = DeepSpeech2Model.from_pretrained(valid_loader, config,
args.checkpoint_path) args.checkpoint_path)
model.eval() model.eval()
......
...@@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): ...@@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def export(self): def export(self):
infer_model = DeepSpeech2InferModel.from_pretrained( infer_model = DeepSpeech2InferModel.from_pretrained(
self.test_loader.dataset, self.config, self.args.checkpoint_path) self.test_loader, self.config, self.args.checkpoint_path)
infer_model.eval() infer_model.eval()
feat_dim = self.test_loader.collate_fn.feature_size feat_dim = self.test_loader.collate_fn.feature_size
static_model = paddle.jit.to_static( static_model = paddle.jit.to_static(
......
...@@ -574,15 +574,14 @@ class U2Tester(U2Trainer): ...@@ -574,15 +574,14 @@ class U2Tester(U2Trainer):
List[paddle.static.InputSpec]: input spec. List[paddle.static.InputSpec]: input spec.
""" """
from deepspeech.models.u2 import U2InferModel from deepspeech.models.u2 import U2InferModel
infer_model = U2InferModel.from_pretrained(self.test_loader.dataset, infer_model = U2InferModel.from_pretrained(self.test_loader,
self.config.model.clone(), self.config.model.clone(),
self.args.checkpoint_path) self.args.checkpoint_path)
feat_dim = self.test_loader.collate_fn.feature_size feat_dim = self.test_loader.collate_fn.feature_size
input_spec = [ input_spec = [
paddle.static.InputSpec( paddle.static.InputSpec(shape=[1, None, feat_dim],
shape=[None, feat_dim, None], dtype='float32'), # audio, [B,T,D]
dtype='float32'), # audio, [B,D,T] paddle.static.InputSpec(shape=[1],
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B] dtype='int64'), # audio_length, [B]
] ]
return infer_model, input_spec return infer_model, input_spec
......
...@@ -154,8 +154,8 @@ class SpeechCollator(): ...@@ -154,8 +154,8 @@ class SpeechCollator():
random_seed (int, optional): for random generator. Defaults to 0. random_seed (int, optional): for random generator. Defaults to 0.
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
if ``keep_transcription_text`` is False, text is token ids else is raw string. if ``keep_transcription_text`` is False, text is token ids else is raw string.
Do augmentations Do augmentations
Padding audio features with zeros to make them have the same shape (or Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch. a user-defined shape) within one batch.
""" """
...@@ -242,6 +242,7 @@ class SpeechCollator(): ...@@ -242,6 +242,7 @@ class SpeechCollator():
# specgram augment # specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram) specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, transcript_part return specgram, transcript_part
def __call__(self, batch): def __call__(self, batch):
...@@ -269,8 +270,8 @@ class SpeechCollator(): ...@@ -269,8 +270,8 @@ class SpeechCollator():
#utt #utt
utts.append(utt) utts.append(utt)
# audio # audio
audios.append(audio.T) # [T, D] audios.append(audio) # [T, D]
audio_lens.append(audio.shape[1]) audio_lens.append(audio.shape[0])
# text # text
# for training, text is token ids # for training, text is token ids
# else text is string, convert to unicode ord # else text is string, convert to unicode ord
......
...@@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer): ...@@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer):
cutoff_top_n, num_processes) cutoff_top_n, num_processes)
@classmethod @classmethod
def from_pretrained(cls, dataset, config, checkpoint_path): def from_pretrained(cls, dataloader, config, checkpoint_path):
"""Build a DeepSpeech2Model model from a pretrained model. """Build a DeepSpeech2Model model from a pretrained model.
Parameters Parameters
---------- ----------
dataset: paddle.io.Dataset dataloader: paddle.io.DataLoader
config: yacs.config.CfgNode config: yacs.config.CfgNode
model configs model configs
...@@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer): ...@@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer):
DeepSpeech2Model DeepSpeech2Model
The model built from pretrained result. The model built from pretrained result.
""" """
model = cls(feat_size=dataset.feature_size, model = cls(feat_size=dataloader.collate_fn.feature_size,
dict_size=dataset.vocab_size, dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers, num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers, num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size, rnn_size=config.model.rnn_layer_size,
......
...@@ -876,11 +876,11 @@ class U2Model(U2BaseModel): ...@@ -876,11 +876,11 @@ class U2Model(U2BaseModel):
return model return model
@classmethod @classmethod
def from_pretrained(cls, dataset, config, checkpoint_path): def from_pretrained(cls, dataloader, config, checkpoint_path):
"""Build a DeepSpeech2Model model from a pretrained model. """Build a DeepSpeech2Model model from a pretrained model.
Args: Args:
dataset (paddle.io.Dataset): not used. dataloader (paddle.io.DataLoader): not used.
config (yacs.config.CfgNode): model configs config (yacs.config.CfgNode): model configs
checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name
...@@ -888,8 +888,8 @@ class U2Model(U2BaseModel): ...@@ -888,8 +888,8 @@ class U2Model(U2BaseModel):
DeepSpeech2Model: The model built from pretrained result. DeepSpeech2Model: The model built from pretrained result.
""" """
config.defrost() config.defrost()
config.input_dim = dataset.feature_size config.input_dim = dataloader.collate_fn.feature_size
config.output_dim = dataset.vocab_size config.output_dim = dataloader.collate_fn.vocab_size
config.freeze() config.freeze()
model = cls.from_config(config) model = cls.from_config(config)
......
...@@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler, ...@@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler,
rng = random.Random(random_seed) rng = random.Random(random_seed)
samples = rng.sample(manifest, num_test_cases) samples = rng.sample(manifest, num_test_cases)
for idx, sample in enumerate(samples): for idx, sample in enumerate(samples):
print("Warm-up Test Case %d: %s", idx, sample['audio_filepath']) print("Warm-up Test Case %d: %s" % (idx, sample['feat']))
start_time = time.time() start_time = time.time()
transcript = audio_process_handler(sample['audio_filepath']) transcript = audio_process_handler(sample['feat'])
finish_time = time.time() finish_time = time.time()
print("Response Time: %f, Transcript: %s" % print("Response Time: %f, Transcript: %s" %
(finish_time - start_time, transcript)) (finish_time - start_time, transcript))
......
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
## Deepspeech2 ## Deepspeech2
| Model | release | Config | Test set | Loss | CER | | Model | Params | Release | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | | DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382,0.073507 |
| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | | DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
| DeepSpeech2 58.4M | 1.8.5 | - | test | - | 0.080447 | | DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 |
...@@ -10,8 +10,8 @@ data: ...@@ -10,8 +10,8 @@ data:
min_output_input_ratio: 0.00 min_output_input_ratio: 0.00
max_output_input_ratio: .inf max_output_input_ratio: .inf
collator: collator:
batch_size: 64 # one gpu
mean_std_filepath: data/mean_std.json mean_std_filepath: data/mean_std.json
unit_type: char unit_type: char
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
...@@ -33,7 +33,6 @@ collator: ...@@ -33,7 +33,6 @@ collator:
sortagrad: True sortagrad: True
shuffle_method: batch_shuffle shuffle_method: batch_shuffle
num_workers: 0 num_workers: 0
batch_size: 64 # one gpu
model: model:
num_conv_layers: 2 num_conv_layers: 2
......
...@@ -31,10 +31,10 @@ fi ...@@ -31,10 +31,10 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n # export ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi fi
...@@ -2,25 +2,26 @@ ...@@ -2,25 +2,26 @@
## Conformer ## Conformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | | conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
## Chunk Conformer ## Chunk Conformer
| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
## Transformer ## Transformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | ---| | --- | --- | --- | --- | --- | --- | --- | ---|
| transformer | conf/transformer.yaml | spec_aug + shift | test | attention | - | - | | transformer | - | conf/transformer.yaml | spec_aug + shift | test | attention | - | - |
...@@ -60,7 +60,7 @@ def create_manifest(data_dir, manifest_path_prefix): ...@@ -60,7 +60,7 @@ def create_manifest(data_dir, manifest_path_prefix):
if line == '': if line == '':
continue continue
audio_id, text = line.split(' ', 1) audio_id, text = line.split(' ', 1)
# remove withespace # remove withespace, charactor text
text = ''.join(text.split()) text = ''.join(text.split())
transcript_dict[audio_id] = text transcript_dict[audio_id] = text
...@@ -123,6 +123,8 @@ def main(): ...@@ -123,6 +123,8 @@ def main():
target_dir=args.target_dir, target_dir=args.target_dir,
manifest_path=args.manifest_prefix) manifest_path=args.manifest_prefix)
print("Data download and manifest prepare done!")
if __name__ == '__main__': if __name__ == '__main__':
main() main()
*.tgz
manifest.*
data_thchs30
resource
test-noise
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare THCHS-30 mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from multiprocessing.pool import Pool
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
# Root of the local dataset cache; the default --target_dir is derived from it.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
# Base URL that the three archive URLs below are built from.
URL_ROOT = 'http://www.openslr.org/resources/18'
# Alternative URL root, currently disabled (presumably a mirror -- confirm before enabling).
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/18'
DATA_URL = URL_ROOT + '/data_thchs30.tgz'
TEST_NOISE_URL = URL_ROOT + '/test-noise.tgz'
RESOURCE_URL = URL_ROOT + '/resource.tgz'
# MD5 digests paired with the archive URLs above; main() hands each pair to
# prepare_dataset() as (url, md5sum).
MD5_DATA = '2d2252bde5c8429929e1841d4cb95e90'
MD5_TEST_NOISE = '7e8a985fb965b84141b68c68556c2030'
MD5_RESOURCE = 'c0b2a565b4970a0c4fe89fefbf2d97e1'
# Command-line interface; the module docstring doubles as the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/THCHS30",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
# NOTE: arguments are parsed when the module is loaded, not inside main();
# main() reads args.target_dir and args.manifest_prefix from this global.
args = parser.parse_args()
def read_trn(filepath):
    """Read a ``.trn`` transcript file.

    Only the first three lines of the file are used:
    word text in the first line,
    syllable text in the second line,
    phoneme text in the third line.

    Args:
        filepath (str): trn path.

    Returns:
        list(str): [word, syllable, phone]. All whitespace is removed from
            the word text; the other two lines are returned unchanged.

    Raises:
        AssertionError: if the file has fewer than three lines.
    """
    # The transcripts hold Mandarin text, so decode explicitly as UTF-8
    # instead of depending on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Everything after the third line is dropped, including the empty
        # string that a trailing newline leaves behind.
        lines = f.read().split('\n')[:3]
    assert len(lines) == 3, lines
    # character text: remove whitespace between characters
    word_text = ''.join(lines[0].split())
    return [word_text, *lines[1:]]
def resolve_symlink(filepath):
    """Resolve a "text symlink": a file whose content is a relative path.

    The content of ``filepath`` is read, stripped of surrounding whitespace
    and interpreted relative to the directory that contains ``filepath``.

    Args:
        filepath (str): path of the file that stores the relative link.

    Returns:
        pathlib.Path: the resolved path of the link target.
    """
    link_file = Path(filepath)
    link_target = Path(link_file.read_text().strip())
    return (link_file.parent / link_target).resolve()
def create_manifest(data_dir, manifest_path_prefix):
    """Write one manifest file per data split ('train', 'dev', 'test').

    Every ``.wav`` file found under ``<data_dir>/<split>`` becomes one JSON
    line in ``<manifest_path_prefix>.<split>``. The transcript is located
    through the ``<wav path>.trn`` file next to the audio.

    Args:
        data_dir (str): directory that contains the split sub-directories.
        manifest_path_prefix (str): output path prefix; the split name is
            appended after a dot.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    for split in ('train', 'dev', 'test'):
        entries = []
        split_dir = os.path.join(data_dir, split)
        for folder, _, names in sorted(os.walk(split_dir)):
            for name in names:
                candidate = os.path.join(folder, name)
                # Only audio files produce manifest entries.
                if not candidate.endswith('.wav'):
                    continue
                wav_path = os.path.abspath(candidate)
                trn_path = resolve_symlink(wav_path + '.trn')

                assert os.path.exists(wav_path) and os.path.exists(trn_path)

                # Utterance id is the file name without its 4-char '.wav' suffix.
                utt_id = os.path.basename(wav_path)[:-4]
                word_text, syllable_text, phone_text = read_trn(trn_path)
                samples, sample_rate = soundfile.read(wav_path)
                seconds = float(len(samples) / sample_rate)

                record = {
                    'utt': utt_id,
                    'feat': wav_path,
                    'feat_shape': (seconds, ),  # second
                    'text': word_text,
                    'syllable': syllable_text,
                    'phone': phone_text,
                }
                entries.append(json.dumps(record, ensure_ascii=False))

        out_path = manifest_path_prefix + '.' + split
        with codecs.open(out_path, 'w', 'utf-8') as fout:
            for entry in entries:
                fout.write(entry + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download and unpack one archive, then build manifests if it is the corpus.

    Args:
        url (str): archive URL, forwarded to ``download``.
        md5sum (str): expected checksum, forwarded to ``download``.
        target_dir (str): directory the archive is downloaded and unpacked into.
        manifest_path (str): manifest path prefix given to ``create_manifest``.
        subset (str): name of the directory expected under ``target_dir``
            once the archive is unpacked.
    """
    datadir = os.path.join(target_dir, subset)
    if os.path.exists(datadir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive = download(url, md5sum, target_dir)
        unpack(archive, target_dir)

    # Manifests are generated only for the main speech corpus.
    if subset == 'data_thchs30':
        create_manifest(datadir, manifest_path)
def main():
    """Prepare the three archives in parallel worker processes.

    Each archive (speech data, test noise, resource) is handled by one
    ``prepare_dataset`` call; that function creates manifests only for the
    ``data_thchs30`` subset. Reads ``target_dir`` and ``manifest_prefix``
    from the module-level ``args``.
    """
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    tasks = [
        (DATA_URL, MD5_DATA, args.target_dir, args.manifest_prefix,
         "data_thchs30"),
        (TEST_NOISE_URL, MD5_TEST_NOISE, args.target_dir, args.manifest_prefix,
         "test-noise"),
        (RESOURCE_URL, MD5_RESOURCE, args.target_dir, args.manifest_prefix,
         "resource"),
    ]
    # One worker per archive: a fixed pool of 7 would start more processes
    # than there are tasks to run.
    with Pool(len(tasks)) as pool:
        pool.starmap(prepare_dataset, tasks)

    print("Data download and manifest prepare done!")


if __name__ == '__main__':
    main()
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
## Deepspeech2 ## Deepspeech2
| Model | release | Config | Test set | Loss | WER | | Model | Params | release | Config | Test set | Loss | WER |
| --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | | DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | | DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 | | DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 |
...@@ -3,16 +3,21 @@ data: ...@@ -3,16 +3,21 @@ data:
train_manifest: data/manifest.train train_manifest: data/manifest.train
dev_manifest: data/manifest.dev-clean dev_manifest: data/manifest.dev-clean
test_manifest: data/manifest.test-clean test_manifest: data/manifest.test-clean
mean_std_filepath: data/mean_std.json
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json
batch_size: 20
min_input_len: 0.0 min_input_len: 0.0
max_input_len: 27.0 # second max_input_len: 27.0 # second
min_output_len: 0.0 min_output_len: 0.0
max_output_len: .inf max_output_len: .inf
min_output_input_ratio: 0.00 min_output_input_ratio: 0.00
max_output_input_ratio: .inf max_output_input_ratio: .inf
collator:
batch_size: 20
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear specgram_type: linear
target_sample_rate: 16000 target_sample_rate: 16000
max_freq: None max_freq: None
......
...@@ -2,17 +2,17 @@ ...@@ -2,17 +2,17 @@
## Conformer ## Conformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | conf/conformer.yaml | spec_aug + shift | test-all | attention | test-all 6.35 | 0.057117 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-all | attention | 6.35 | 0.057117 |
| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.35 | 0.030162 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 |
| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | test-all 6.35 | 0.037910 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 |
| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | test-all 6.35 | 0.037761 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 |
| conformer | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | test-all 6.35 | 0.032115 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 |
## Transformer ## Transformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | conf/transformer.yaml | spec_aug + shift | test-all | attention | test-all 6.98 | 0.066500 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-all | attention | 6.98 | 0.066500 |
| transformer | conf/transformer.yaml | spec_aug + shift | test-clean | attention | test-all 6.98 | 0.036 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 |
...@@ -3,18 +3,20 @@ data: ...@@ -3,18 +3,20 @@ data:
train_manifest: data/manifest.train train_manifest: data/manifest.train
dev_manifest: data/manifest.dev dev_manifest: data/manifest.dev
test_manifest: data/manifest.test test_manifest: data/manifest.test
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 4
min_input_len: 0.5 min_input_len: 0.5
max_input_len: 20.0 max_input_len: 20.0
min_output_len: 0.0 min_output_len: 0.0
max_output_len: 400.0 max_output_len: 400.0
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
...@@ -80,7 +82,7 @@ model: ...@@ -80,7 +82,7 @@ model:
training: training:
n_epoch: 120 n_epoch: 120
accum_grad: 1 accum_grad: 8
global_grad_clip: 5.0 global_grad_clip: 5.0
optim: adam optim: adam
optim_conf: optim_conf:
......
...@@ -3,18 +3,20 @@ data: ...@@ -3,18 +3,20 @@ data:
train_manifest: data/manifest.train train_manifest: data/manifest.train
dev_manifest: data/manifest.dev dev_manifest: data/manifest.dev
test_manifest: data/manifest.test test_manifest: data/manifest.test
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 64
min_input_len: 0.5 # second min_input_len: 0.5 # second
max_input_len: 20.0 # second max_input_len: 20.0 # second
min_output_len: 0.0 # tokens min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
...@@ -103,6 +105,6 @@ decoding: ...@@ -103,6 +105,6 @@ decoding:
# >0: for decoding, use fixed chunk size as set. # >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here. # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False. simulate_streaming: true # simulate streaming inference. Defaults to False.
...@@ -3,18 +3,20 @@ data: ...@@ -3,18 +3,20 @@ data:
train_manifest: data/manifest.train train_manifest: data/manifest.train
dev_manifest: data/manifest.dev dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean test_manifest: data/manifest.test-clean
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 16
min_input_len: 0.5 # seconds min_input_len: 0.5 # seconds
max_input_len: 20.0 # seconds max_input_len: 20.0 # seconds
min_output_len: 0.0 # tokens min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
......
...@@ -3,18 +3,20 @@ data: ...@@ -3,18 +3,20 @@ data:
train_manifest: data/manifest.train train_manifest: data/manifest.train
dev_manifest: data/manifest.dev dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean test_manifest: data/manifest.test-clean
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 64
min_input_len: 0.5 # second min_input_len: 0.5 # second
max_input_len: 20.0 # second max_input_len: 20.0 # second
min_output_len: 0.0 # tokens min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
......
# Minimum CMake version: FetchContent_MakeAvailable (used below) was added in 3.14.
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(deepspeech VERSION 0.1)
# Print the full compiler/linker command lines during the build.
set(CMAKE_VERBOSE_MAKEFILE on)
# Build with the C++14 standard.
set(CMAKE_CXX_STANDARD 14)
# CMake modules used to download and build third-party dependencies.
include(FetchContent)
include(ExternalProject)
# FetchContent settings: show fetch output instead of hiding it, then resolve
# "fc_patch" to an absolute path under the top-level source directory and use
# it as the directory where FetchContent stores fetched content.
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
###############################################################################
# Option Configurations
###############################################################################
# TEST_DEBUG: user-settable debug switch, OFF by default.
option(TEST_DEBUG "option for debug" OFF)
###############################################################################
# Include third party
###############################################################################
# Template for adding another third-party dependency:
# FetchContent_Declare()
# # FetchContent_MakeAvailable was not added until CMake 3.14
# FetchContent_MakeAvailable()
# include_directories()

# The FetchContent module is already included near the top of this file, so it
# is not included again before each dependency.

# ABSEIL-CPP, pinned to the 20210324.1 tag.
FetchContent_Declare(
  absl
  GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git"
  GIT_TAG "20210324.1"
)
FetchContent_MakeAvailable(absl)

# libsndfile, pinned to the 1.0.31 tag.
FetchContent_Declare(
  libsndfile
  GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git"
  GIT_TAG "1.0.31"
)
FetchContent_MakeAvailable(libsndfile)
###############################################################################
# Add local library
###############################################################################
# NOTE: every call in the remaining sections is a template. They are kept
# commented out because calling them without arguments, or with the
# placeholder names (lib_name, file.cc, item0, item1, depend-target), makes
# the configure step fail. Fill in real values and uncomment as needed.

# system lib
# find_package()
# if the directory has its own CMakeLists.txt
# add_subdirectory()
# if the directory does not have a CMakeLists.txt
# add_library(lib_name STATIC file.cc)
# target_link_libraries(lib_name item0 item1)
# add_dependencies(lib_name depend-target)
###############################################################################
# Library installation
###############################################################################
# install()
###############################################################################
# Build binary file
###############################################################################
# add_executable()
# target_link_libraries()
# Collect every source file found in the current directory into DIR_LIB_SRCS.
aux_source_directory(. DIR_LIB_SRCS)
# Build those sources into the static library target `decoder`.
add_library(decoder STATIC ${DIR_LIB_SRCS})
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册