Commit 8e73d184 authored by Hui Zhang

tiny/s0/s1 can run all

Parent 6f83be1a
@@ -18,8 +18,10 @@ import numpy as np
 import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
+from paddle.io import DataLoader
 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser
@@ -78,26 +80,31 @@ def inference(config, args):
 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()

     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        audio_len = feature[0].shape[0]
         audio_len = np.array([audio_len]).astype('int64')  # [1]

         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,
@@ -138,7 +145,7 @@ if __name__ == "__main__":
     add_arg('host_ip', str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port', int, 8086, "Server's IP port.")
+    add_arg('host_port', int, 8089, "Server's IP port.")
     add_arg('speech_save_dir', str,
             'demo_cache',
             "Directory to save demo audios.")
......
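
Aside (illustration, not part of the commit): the handler above switches the feature layout from [1, D, T] to [1, T, D], so the utterance length is now read from axis 0 instead of axis 1. A minimal numpy sketch of the new convention, with made-up shapes:

import numpy as np

feat = np.random.rand(120, 161).astype('float32')     # [T, D]: 120 frames, 161 bins (invented)
audio = feat[np.newaxis, :, :]                        # [1, T, D] batch of one
audio_len = np.array([feat.shape[0]], dtype='int64')  # [1]; frame count comes from axis 0
assert audio.shape == (1, 120, 161) and audio_len[0] == 120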
@@ -16,8 +16,10 @@ import functools
 import numpy as np
 import paddle
+from paddle.io import DataLoader
 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser
@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()

     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        # audio = audio.swapaxes(1,2)
+        print('---file_to_transcript feature----')
+        print(audio.shape)
+        audio_len = feature[0].shape[0]
+        print(audio_len)
         audio_len = np.array([audio_len]).astype('int64')  # [1]

         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,
@@ -91,7 +102,7 @@ if __name__ == "__main__":
     add_arg('host_ip', str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port', int, 8086, "Server's IP port.")
+    add_arg('host_port', int, 8088, "Server's IP port.")
     add_arg('speech_save_dir', str,
             'demo_cache',
             "Directory to save demo audios.")
......
@@ -12,26 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains DeepSpeech2 model."""
+import os
 import time
 from collections import defaultdict
+from contextlib import nullcontext
 from pathlib import Path
+from typing import Optional

+import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
+from paddle import inference
 from paddle.io import DataLoader
+from yacs.config import CfgNode

+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.ds2 import DeepSpeech2InferModel
 from deepspeech.models.ds2 import DeepSpeech2Model
+from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
+from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
+from deepspeech.training.reporter import report
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
+from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import UpdateConfig
@@ -42,9 +53,9 @@ class DeepSpeech2Trainer(Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)

-    def train_batch(self, batch_index, batch_data, msg):
+    def train_batch(self, batch_index, batch, msg):
         start = time.time()
-        loss = self.model(*batch_data)
+        loss = self.model(*batch)
         loss.backward()
         layer_tools.print_grads(self.model, print_func=None)
         self.optimizer.step()
@@ -176,7 +187,7 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.data.sortagrad,
             shuffle_method=config.data.shuffle_method)

-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn = SpeechCollator(keep_transcription_text=False, return_utts=False)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
@@ -190,10 +201,55 @@ class DeepSpeech2Trainer(Trainer):
             collate_fn=collate_fn)
         logger.info("Setup train/valid Dataloader!")

+        config.data.manifest = config.data.test_manifest
+        config.data.keep_transcription_text = True
+        config.data.augmentation_config = ""
+        # filter test examples, will cause less examples, but no mismatch with training
+        # and can use large batch size , save training time, so filter test egs now.
+        # config.data.min_input_len = 0.0  # second
+        # config.data.max_input_len = float('inf')  # second
+        # config.data.min_output_len = 0.0  # tokens
+        # config.data.max_output_len = float('inf')  # tokens
+        # config.data.min_output_input_ratio = 0.00
+        # config.data.max_output_input_ratio = float('inf')
+        test_dataset = ManifestDataset.from_config(config)
+        # return text ord id
+        self.test_loader = DataLoader(
+            test_dataset,
+            batch_size=config.decoding.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=SpeechCollator(keep_transcription_text=True, return_utts=True))
+        logger.info("Setup test Dataloader!")
+

 class DeepSpeech2Tester(DeepSpeech2Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # testing config
+        default = CfgNode(
+            dict(
+                alpha=2.5,  # Coef of LM for beam search.
+                beta=0.3,  # Coef of WC for beam search.
+                cutoff_prob=1.0,  # Cutoff probability for pruning.
+                cutoff_top_n=40,  # Cutoff number for pruning.
+                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                num_proc_bsearch=8,  # # of CPUs for beam search.
+                beam_size=500,  # Beam search width.
+                batch_size=128,  # decoding batch size
+            ))
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)
+        self._text_featurizer = TextFeaturizer(
+            unit_type=config.data.unit_type, vocab_filepath=None)

     def ordid2token(self, texts, texts_len):
         """ ord() id to chr() chr """
@@ -204,15 +260,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, audio, audio_len, texts, texts_len):
-        cfg = self.config.decoding
-        errors_sum, len_refs, num_ins = 0.0, 0, 0
-        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
-        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
-        vocab_list = self.test_loader.dataset.vocab_list
-        target_transcripts = self.ordid2token(texts, texts_len)
+    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
+        self.autolog.times.start()
+        self.autolog.times.stamp()
         result_transcripts = self.model.decode(
             audio,
             audio_len,
@@ -225,14 +276,48 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_prob=cfg.cutoff_prob,
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)
+        #replace the <space> with ' '
+        result_transcripts = [
+            self._text_featurizer.detokenize(sentence)
+            for sentence in result_transcripts
+        ]
+        self.autolog.times.stamp()
+        self.autolog.times.stamp()
+        self.autolog.times.end()
+        return result_transcripts
+
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
+        cfg = self.config.decoding
+        errors_sum, len_refs, num_ins = 0.0, 0, 0
+        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
+        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
+        #vocab_list = self.test_loader.collate_fn.vocab_list
+        vocab_list = self.test_loader.dataset.vocab_list
+        target_transcripts = self.ordid2token(texts, texts_len)

-        for target, result in zip(target_transcripts, result_transcripts):
+        result_transcripts = self.compute_result_transcripts(audio, audio_len,
+                                                             vocab_list, cfg)
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
             num_ins += 1
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
+            if fout:
+                fout.write({"utt": utt, "ref": target, "hyp": result})
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
             logger.info("Current error rate [%s] = %f" %
                         (cfg.error_rate_type, error_rate_func(target, result)))
@@ -247,13 +332,19 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     @paddle.no_grad()
     def test(self):
         logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+        self.autolog = Autolog(
+            batch_size=self.config.decoding.batch_size,
+            model_name="deepspeech2",
+            model_precision="fp32").getlog()
         self.model.eval()
         cfg = self.config
         error_rate_type = None
         errors_sum, len_refs, num_ins = 0.0, 0, 0
-        for i, batch in enumerate(self.test_loader):
-            metrics = self.compute_metrics(*batch)
+        with jsonlines.open(self.args.result_file, 'w') as fout:
+            for i, batch in enumerate(self.test_loader):
+                audio, audio_len, texts, texts_len, utts = batch
+                metrics = self.compute_metrics(utts, audio, audio_len, texts,
+                                               texts_len, fout)
                 errors_sum += metrics['errors_sum']
                 len_refs += metrics['len_refs']
                 num_ins += metrics['num_ins']
@@ -268,101 +359,234 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         msg += "Final error rate [%s] (%d/%d) = %f" % (
             error_rate_type, num_ins, num_ins, errors_sum / len_refs)
         logger.info(msg)
+        self.autolog.report()

-    def run_test(self):
-        self.resume_or_scratch()
-        try:
-            self.test()
-        except KeyboardInterrupt:
-            exit(-1)
-
     def export(self):
-        infer_model = DeepSpeech2InferModel.from_pretrained(
-            self.test_loader.dataset, self.config, self.args.checkpoint_path)
+        if self.args.model_type == 'offline':
+            infer_model = DeepSpeech2InferModel.from_pretrained(
+                self.test_loader, self.config, self.args.checkpoint_path)
+        elif self.args.model_type == 'online':
+            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
+                self.test_loader, self.config, self.args.checkpoint_path)
+        else:
+            raise Exception("wrong model type")
         infer_model.eval()
+        #feat_dim = self.test_loader.collate_fn.feature_size
         feat_dim = self.test_loader.dataset.feature_size
-        static_model = paddle.jit.to_static(
-            infer_model,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, None, feat_dim],
-                    dtype='float32'),  # audio, [B,T,D]
-                paddle.static.InputSpec(shape=[None],
-                                        dtype='int64'),  # audio_length, [B]
-            ])
+        static_model = infer_model.export()
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)

-    def run_export(self):
-        try:
-            self.export()
-        except KeyboardInterrupt:
-            exit(-1)
-
-    def setup(self):
-        """Setup the experiment.
-        """
-        paddle.set_device(self.args.device)
-        self.setup_output_dir()
-        self.setup_checkpointer()
-        self.setup_dataloader()
-        self.setup_model()
-        self.iteration = 0
-        self.epoch = 0
-
-    def setup_model(self):
-        config = self.config
-        model = DeepSpeech2Model(
-            feat_size=self.test_loader.dataset.feature_size,
-            dict_size=self.test_loader.dataset.vocab_size,
-            num_conv_layers=config.model.num_conv_layers,
-            num_rnn_layers=config.model.num_rnn_layers,
-            rnn_size=config.model.rnn_layer_size,
-            use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
-        self.model = model
-        logger.info("Setup model!")
-
-    def setup_dataloader(self):
-        config = self.config.clone()
-        config.defrost()
-        # return raw text
-        config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
-        # filter test examples, will cause less examples, but no mismatch with training
-        # and can use large batch size , save training time, so filter test egs now.
-        # config.data.min_input_len = 0.0  # second
-        # config.data.max_input_len = float('inf')  # second
-        # config.data.min_output_len = 0.0  # tokens
-        # config.data.max_output_len = float('inf')  # tokens
-        # config.data.min_output_input_ratio = 0.00
-        # config.data.max_output_input_ratio = float('inf')
-        test_dataset = ManifestDataset.from_config(config)
-        # return text ord id
-        self.test_loader = DataLoader(
-            test_dataset,
-            batch_size=config.decoding.batch_size,
-            shuffle=False,
-            drop_last=False,
-            collate_fn=SpeechCollator(keep_transcription_text=True))
-        logger.info("Setup test Dataloader!")
-
-    def setup_output_dir(self):
-        """Create a directory used for output.
-        """
-        # output dir
-        if self.args.output:
-            output_dir = Path(self.args.output).expanduser()
-            output_dir.mkdir(parents=True, exist_ok=True)
-        else:
-            output_dir = Path(
-                self.args.checkpoint_path).expanduser().parent.parent
-            output_dir.mkdir(parents=True, exist_ok=True)
-        self.output_dir = output_dir
+
+class DeepSpeech2ExportTester(DeepSpeech2Tester):
+    def __init__(self, config, args):
+        super().__init__(config, args)
+
+    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
+        if self.args.model_type == "online":
+            output_probs, output_lens = self.static_forward_online(audio,
+                                                                   audio_len)
+        elif self.args.model_type == "offline":
+            output_probs, output_lens = self.static_forward_offline(audio,
+                                                                    audio_len)
+        else:
+            raise Exception("wrong model type")
+        self.predictor.clear_intermediate_tensor()
+        self.predictor.try_shrink_memory()
+        self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path,
+                                       vocab_list, cfg.decoding_method)
+        result_transcripts = self.model.decoder.decode_probs(
+            output_probs, output_lens, vocab_list, cfg.decoding_method,
+            cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size,
+            cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch)
+        #replace the <space> with ' '
+        result_transcripts = [
+            self._text_featurizer.detokenize(sentence)
+            for sentence in result_transcripts
+        ]
+        return result_transcripts
+
+    def static_forward_online(self, audio, audio_len,
+                              decoder_chunk_size: int=1):
+        """
+        Parameters
+        ----------
+        audio (Tensor): shape[B, T, D]
+        audio_len (Tensor): shape[B]
+        decoder_chunk_size(int)
+        Returns
+        -------
+        output_probs(numpy.array): shape[B, T, vocab_size]
+        output_lens(numpy.array): shape[B]
+        """
+        output_probs_list = []
+        output_lens_list = []
+        subsampling_rate = self.model.encoder.conv.subsampling_rate
+        receptive_field_length = self.model.encoder.conv.receptive_field_length
+        chunk_stride = subsampling_rate * decoder_chunk_size
+        chunk_size = (decoder_chunk_size - 1
+                      ) * subsampling_rate + receptive_field_length
+        x_batch = audio.numpy()
+        batch_size, Tmax, x_dim = x_batch.shape
+        x_len_batch = audio_len.numpy().astype(np.int64)
+        if (Tmax - chunk_size) % chunk_stride != 0:
+            padding_len_batch = chunk_stride - (
+                Tmax - chunk_size
+            ) % chunk_stride  # The length of padding for the batch
+        else:
+            padding_len_batch = 0
+        x_list = np.split(x_batch, batch_size, axis=0)
+        x_len_list = np.split(x_len_batch, batch_size, axis=0)
+        for x, x_len in zip(x_list, x_len_list):
+            self.autolog.times.start()
+            self.autolog.times.stamp()
+            x_len = x_len[0]
+            assert (chunk_size <= x_len)
+            if (x_len - chunk_size) % chunk_stride != 0:
+                padding_len_x = chunk_stride - (x_len - chunk_size
+                                                ) % chunk_stride
+            else:
+                padding_len_x = 0
+            padding = np.zeros(
+                (x.shape[0], padding_len_x, x.shape[2]), dtype=x.dtype)
+            padded_x = np.concatenate([x, padding], axis=1)
+            num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
+            num_chunk = int(num_chunk)
+            chunk_state_h_box = np.zeros(
+                (self.config.model.num_rnn_layers, 1,
+                 self.config.model.rnn_layer_size),
+                dtype=x.dtype)
+            chunk_state_c_box = np.zeros(
+                (self.config.model.num_rnn_layers, 1,
+                 self.config.model.rnn_layer_size),
+                dtype=x.dtype)
+            input_names = self.predictor.get_input_names()
+            audio_handle = self.predictor.get_input_handle(input_names[0])
+            audio_len_handle = self.predictor.get_input_handle(input_names[1])
+            h_box_handle = self.predictor.get_input_handle(input_names[2])
+            c_box_handle = self.predictor.get_input_handle(input_names[3])
+            probs_chunk_list = []
+            probs_chunk_lens_list = []
+            for i in range(0, num_chunk):
+                start = i * chunk_stride
+                end = start + chunk_size
+                x_chunk = padded_x[:, start:end, :]
+                if x_len < i * chunk_stride:
+                    x_chunk_lens = 0
+                else:
+                    x_chunk_lens = min(x_len - i * chunk_stride, chunk_size)
+                if (x_chunk_lens <
+                        receptive_field_length):  #means the number of input frames in the chunk is not enough for predicting one prob
+                    break
+                x_chunk_lens = np.array([x_chunk_lens])
+                audio_handle.reshape(x_chunk.shape)
+                audio_handle.copy_from_cpu(x_chunk)
+                audio_len_handle.reshape(x_chunk_lens.shape)
+                audio_len_handle.copy_from_cpu(x_chunk_lens)
+                h_box_handle.reshape(chunk_state_h_box.shape)
+                h_box_handle.copy_from_cpu(chunk_state_h_box)
+                c_box_handle.reshape(chunk_state_c_box.shape)
+                c_box_handle.copy_from_cpu(chunk_state_c_box)
+                output_names = self.predictor.get_output_names()
+                output_handle = self.predictor.get_output_handle(
+                    output_names[0])
+                output_lens_handle = self.predictor.get_output_handle(
+                    output_names[1])
+                output_state_h_handle = self.predictor.get_output_handle(
+                    output_names[2])
+                output_state_c_handle = self.predictor.get_output_handle(
+                    output_names[3])
+                self.predictor.run()
+                output_chunk_probs = output_handle.copy_to_cpu()
+                output_chunk_lens = output_lens_handle.copy_to_cpu()
+                chunk_state_h_box = output_state_h_handle.copy_to_cpu()
+                chunk_state_c_box = output_state_c_handle.copy_to_cpu()
+                probs_chunk_list.append(output_chunk_probs)
+                probs_chunk_lens_list.append(output_chunk_lens)
+            output_probs = np.concatenate(probs_chunk_list, axis=1)
+            output_lens = np.sum(probs_chunk_lens_list, axis=0)
+            vocab_size = output_probs.shape[2]
+            output_probs_padding_len = Tmax + padding_len_batch - output_probs.shape[
+                1]
+            output_probs_padding = np.zeros(
+                (1, output_probs_padding_len, vocab_size),
+                dtype=output_probs.
+                dtype)  # The prob padding for a piece of utterance
+            output_probs = np.concatenate(
+                [output_probs, output_probs_padding], axis=1)
+            output_probs_list.append(output_probs)
+            output_lens_list.append(output_lens)
+            self.autolog.times.stamp()
+            self.autolog.times.stamp()
+            self.autolog.times.end()
+        output_probs = np.concatenate(output_probs_list, axis=0)
+        output_lens = np.concatenate(output_lens_list, axis=0)
+        return output_probs, output_lens
+
+    def static_forward_offline(self, audio, audio_len):
+        """
+        Parameters
+        ----------
+        audio (Tensor): shape[B, T, D]
+        audio_len (Tensor): shape[B]
+
+        Returns
+        -------
+        output_probs(numpy.array): shape[B, T, vocab_size]
+        output_lens(numpy.array): shape[B]
+        """
+        x = audio.numpy()
+        x_len = audio_len.numpy().astype(np.int64)
+        input_names = self.predictor.get_input_names()
+        audio_handle = self.predictor.get_input_handle(input_names[0])
+        audio_len_handle = self.predictor.get_input_handle(input_names[1])
+        audio_handle.reshape(x.shape)
+        audio_handle.copy_from_cpu(x)
+        audio_len_handle.reshape(x_len.shape)
+        audio_len_handle.copy_from_cpu(x_len)
+        self.autolog.times.start()
+        self.autolog.times.stamp()
+        self.predictor.run()
+        self.autolog.times.stamp()
+        self.autolog.times.stamp()
+        self.autolog.times.end()
+        output_names = self.predictor.get_output_names()
+        output_handle = self.predictor.get_output_handle(output_names[0])
+        output_lens_handle = self.predictor.get_output_handle(output_names[1])
+        output_probs = output_handle.copy_to_cpu()
+        output_lens = output_lens_handle.copy_to_cpu()
+        return output_probs, output_lens
+
+    def setup_model(self):
+        super().setup_model()
+        infer_config = inference.Config(self.args.export_path + ".pdmodel",
+                                        self.args.export_path + ".pdiparams")
+        if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+            infer_config.enable_use_gpu(100, 0)
+        infer_config.enable_memory_optim()
+        infer_predictor = inference.create_predictor(infer_config)
+        self.predictor = infer_predictor
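
Aside (illustration, not part of the commit): the chunking arithmetic in static_forward_online is easier to see in isolation. Each chunk must span the convolution's receptive field, successive chunks advance by subsampling_rate frames per decoder step, and the tail is zero-padded so the last chunk is full. A runnable sketch; the subsampling_rate and receptive_field_length values here are assumptions for the example, not values read from a model:

subsampling_rate = 4           # assumed for the example
receptive_field_length = 7     # assumed for the example
decoder_chunk_size = 1

chunk_stride = subsampling_rate * decoder_chunk_size
chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length

x_len = 50  # frames in one utterance (made up)
rem = (x_len - chunk_size) % chunk_stride
padding_len_x = 0 if rem == 0 else chunk_stride - rem
num_chunk = (x_len + padding_len_x - chunk_size) // chunk_stride + 1
for i in range(num_chunk):
    start = i * chunk_stride
    print(f"chunk {i}: frames [{start}, {start + chunk_size})")  # [0, 7), [4, 11), ...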
@@ -30,6 +30,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
+    # save asr result to
+    parser.add_argument(
+        "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
......
@@ -30,6 +30,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
+    # save jit model to
+    parser.add_argument(
+        "--export_path", type=str, help="path of the jit model to save")
     args = parser.parse_args()
     print_arguments(args, globals())
......
@@ -34,6 +34,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
+    # save asr result to
+    parser.add_argument(
+        "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
......
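
Aside (illustration, not part of the commit): these entry points now take --result_file, and compute_metrics writes one JSON object per utterance through jsonlines. Reading such a file back might look like this; "result.jsonl" is a made-up path:

import jsonlines

with jsonlines.open("result.jsonl") as reader:
    for item in reader:  # each line is {"utt": ..., "ref": ..., "hyp": ...}
        print(item["utt"], "|", item["ref"], "->", item["hyp"])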
@@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer
 from deepspeech.training.cli import default_argument_parser
 from deepspeech.utils.utility import print_arguments

+# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
+

 def main_sp(config, args):
     exp = Trainer(config, args)
@@ -30,7 +32,7 @@ def main_sp(config, args):

 def main(config, args):
-    if args.device == "gpu" and args.nprocs > 1:
+    if args.nprocs > 0:
         dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
     else:
         main_sp(config, args)
......
@@ -73,11 +73,11 @@ class U2Trainer(Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)

-    def train_batch(self, batch_index, batch_data, msg):
+    def train_batch(self, batch_index, batch, msg):
         train_conf = self.config.training
         start = time.time()

-        loss, attention_loss, ctc_loss = self.model(*batch_data)
+        loss, attention_loss, ctc_loss = self.model(*batch)
         # loss div by `batch_size * accum_grad`
         loss /= train_conf.accum_grad
         loss.backward()
@@ -219,7 +219,7 @@ class U2Trainer(Trainer):
             config.data.augmentation_config = ""
             dev_dataset = ManifestDataset.from_config(config)

-            collate_fn = SpeechCollator(keep_transcription_text=False)
+            collate_fn = SpeechCollator(keep_transcription_text=False, return_utts=False)
             if self.parallel:
                 batch_sampler = SortagradDistributedBatchSampler(
                     train_dataset,
@@ -269,7 +269,7 @@ class U2Trainer(Trainer):
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=SpeechCollator(keep_transcription_text=True))
+            collate_fn=SpeechCollator(keep_transcription_text=True, return_utts=True))
         logger.info("Setup train/valid/test Dataloader!")

     def setup_model(self):
@@ -428,7 +428,7 @@ class U2Tester(U2Trainer):
         num_time = 0.0
         with open(self.args.result_file, 'w') as fout:
             for i, batch in enumerate(self.test_loader):
-                metrics = self.compute_metrics(*batch, fout=fout)
+                metrics = self.compute_metrics(*batch[:-1], fout=fout)
                 num_frames += metrics['num_frames']
                 num_time += metrics["decode_time"]
                 errors_sum += metrics['errors_sum']
@@ -476,12 +476,12 @@ class U2Tester(U2Trainer):
             })
             f.write(data + '\n')

-    def run_test(self):
-        self.resume_or_scratch()
-        try:
-            self.test()
-        except KeyboardInterrupt:
-            sys.exit(-1)
+    # def run_test(self):
+    #     self.resume_or_scratch()
+    #     try:
+    #         self.test()
+    #     except KeyboardInterrupt:
+    #         sys.exit(-1)

     def load_inferspec(self):
         """infer model and input spec.
@@ -512,36 +512,36 @@ class U2Tester(U2Trainer):
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)

-    def run_export(self):
-        try:
-            self.export()
-        except KeyboardInterrupt:
-            sys.exit(-1)
+    # def run_export(self):
+    #     try:
+    #         self.export()
+    #     except KeyboardInterrupt:
+    #         sys.exit(-1)

-    def setup(self):
-        """Setup the experiment.
-        """
-        paddle.set_device(self.args.device)
-        self.setup_output_dir()
-        self.setup_checkpointer()
-        self.setup_dataloader()
-        self.setup_model()
-        self.iteration = 0
-        self.epoch = 0
+    # def setup(self):
+    #     """Setup the experiment.
+    #     """
+    #     paddle.set_device(self.args.device)
+    #     self.setup_output_dir()
+    #     self.setup_checkpointer()
+    #     self.setup_dataloader()
+    #     self.setup_model()
+    #     self.iteration = 0
+    #     self.epoch = 0

-    def setup_output_dir(self):
-        """Create a directory used for output.
-        """
-        # output dir
-        if self.args.output:
-            output_dir = Path(self.args.output).expanduser()
-            output_dir.mkdir(parents=True, exist_ok=True)
-        else:
-            output_dir = Path(
-                self.args.checkpoint_path).expanduser().parent.parent
-            output_dir.mkdir(parents=True, exist_ok=True)
-        self.output_dir = output_dir
+    # def setup_output_dir(self):
+    #     """Create a directory used for output.
+    #     """
+    #     # output dir
+    #     if self.args.output:
+    #         output_dir = Path(self.args.output).expanduser()
+    #         output_dir.mkdir(parents=True, exist_ok=True)
+    #     else:
+    #         output_dir = Path(
+    #             self.args.checkpoint_path).expanduser().parent.parent
+    #         output_dir.mkdir(parents=True, exist_ok=True)
+    #     self.output_dir = output_dir
@@ -14,12 +14,27 @@
 """Contains the text featurizer class."""
 import sentencepiece as spm

-from deepspeech.frontend.utility import EOS
-from deepspeech.frontend.utility import UNK
+from ..utility import EOS
+from ..utility import SPACE
+from ..utility import UNK
+from ..utility import SOS
+from ..utility import BLANK
+from ..utility import MASKCTC
+from ..utility import load_dict
+from deepspeech.utils.log import Log

+logger = Log(__name__).getlog()

+__all__ = ["TextFeaturizer"]

-class TextFeaturizer(object):
-    def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None):
+class TextFeaturizer():
+    def __init__(self,
+                 unit_type,
+                 vocab_filepath,
+                 spm_model_prefix=None,
+                 maskctc=False):
         """Text featurizer, for processing or extracting features from text.

         Currently, it supports char/word/sentence-piece level tokenizing and conversion into
@@ -34,20 +49,21 @@ class TextFeaturizer(object):
         assert unit_type in ('char', 'spm', 'word')
         self.unit_type = unit_type
         self.unk = UNK
+        self.maskctc = maskctc

         if vocab_filepath:
-            self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file(
-                vocab_filepath)
-            self.unk_id = self._vocab_list.index(self.unk)
-            self.eos_id = self._vocab_list.index(EOS)
+            self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
+                vocab_filepath, maskctc)
+            self.vocab_size = len(self.vocab_list)

         if unit_type == 'spm':
             spm_model = spm_model_prefix + '.model'
             self.sp = spm.SentencePieceProcessor()
             self.sp.Load(spm_model)

-    def tokenize(self, text):
+    def tokenize(self, text, replace_space=True):
         if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
         elif self.unit_type == 'word':
             tokens = self.word_tokenize(text)
         else:  # spm
@@ -67,7 +83,7 @@ class TextFeaturizer(object):
         """Convert text string to a list of token indices.

         Args:
-            text (str): Text to process.
+            text (str): Text.

         Returns:
             List[int]: List of token indices.
@@ -75,8 +91,8 @@ class TextFeaturizer(object):
         tokens = self.tokenize(text)
         ids = []
         for token in tokens:
-            token = token if token in self._vocab_dict else self.unk
-            ids.append(self._vocab_dict[token])
+            token = token if token in self.vocab_dict else self.unk
+            ids.append(self.vocab_dict[token])
         return ids

     def defeaturize(self, idxs):
@@ -87,7 +103,7 @@ class TextFeaturizer(object):
             idxs (List[int]): List of token indices.

         Returns:
-            str: Text to process.
+            str: Text.
         """
         tokens = []
         for idx in idxs:
@@ -97,43 +113,22 @@ class TextFeaturizer(object):
         text = self.detokenize(tokens)
         return text

-    @property
-    def vocab_size(self):
-        """Return the vocabulary size.
-        :return: Vocabulary size.
-        :rtype: int
-        """
-        return len(self._vocab_list)
-
-    @property
-    def vocab_list(self):
-        """Return the vocabulary in list.
-        Returns:
-            List[str]: tokens.
-        """
-        return self._vocab_list
-
-    @property
-    def vocab_dict(self):
-        """Return the vocabulary in dict.
-        Returns:
-            Dict[str, int]: token str -> int
-        """
-        return self._vocab_dict
-
-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
         """Character tokenizer.

         Args:
             text (str): text string.
+            replace_space (bool): False only used by build_vocab.py.

         Returns:
             List[str]: tokens.
         """
-        return list(text.strip())
+        text = text.strip()
+        if replace_space:
+            text_list = [SPACE if item == " " else item for item in list(text)]
+        else:
+            text_list = list(text)
+        return text_list

     def char_detokenize(self, tokens):
         """Character detokenizer.
@@ -144,6 +139,7 @@ class TextFeaturizer(object):
         Returns:
             str: text string.
         """
+        tokens = tokens.replace(SPACE, " ")
         return "".join(tokens)

     def word_tokenize(self, text):
@@ -206,14 +202,28 @@ class TextFeaturizer(object):
         return decode(tokens)

-    def _load_vocabulary_from_file(self, vocab_filepath):
+    def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool):
         """Load vocabulary from file."""
-        vocab_lines = []
-        with open(vocab_filepath, 'r', encoding='utf-8') as file:
-            vocab_lines.extend(file.readlines())
-        vocab_list = [line[:-1] for line in vocab_lines]
+        vocab_list = load_dict(vocab_filepath, maskctc)
+        assert vocab_list is not None
+        logger.info(f"Vocab: {vocab_list}")
+
         id2token = dict(
             [(idx, token) for (idx, token) in enumerate(vocab_list)])
         token2id = dict(
             [(token, idx) for (idx, token) in enumerate(vocab_list)])
-        return token2id, id2token, vocab_list
+
+        blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
+        maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1
+        unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
+        eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
+        sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1
+        space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1
+
+        logger.info(f"UNK id: {unk_id}")
+        logger.info(f"EOS id: {eos_id}")
+        logger.info(f"SOS id: {sos_id}")
+        logger.info(f"SPACE id: {space_id}")
+        logger.info(f"BLANK id: {blank_id}")
+        logger.info(f"MASKCTC id: {maskctc_id}")
+        return token2id, id2token, vocab_list, unk_id, eos_id
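
Aside (illustration, not part of the commit): the new <space> handling means char_tokenize maps " " to the SPACE token and char_detokenize maps it back, so decoded hypotheses contain real spaces again. A plain-Python sketch mimicking the two methods, not a call into the class:

SPACE = "<space>"

def char_tokenize(text, replace_space=True):
    text = text.strip()
    return [SPACE if ch == " " else ch for ch in text] if replace_space else list(text)

def char_detokenize(tokens):
    return "".join(tokens).replace(SPACE, " ")

tokens = char_tokenize("hi there")      # ['h', 'i', '<space>', 't', 'h', 'e', 'r', 'e']
assert char_detokenize(tokens) == "hi there"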
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains data helper functions."""
-import codecs
 import json
 import math
+import tarfile
+from collections import namedtuple
+from typing import List
+from typing import Optional
+from typing import Text

+import jsonlines
 import numpy as np

 from deepspeech.utils.log import Log
@@ -23,16 +28,40 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs",
-    "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "EOS", "UNK",
-    "BLANK"
+    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
+    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
 ]

 IGNORE_ID = -1
-SOS = "<sos/eos>"
+# `sos` and `eos` using same token
+SOS = "<eos>"
 EOS = SOS
 UNK = "<unk>"
 BLANK = "<blank>"
+MASKCTC = "<mask>"
+SPACE = "<space>"
+
+
+def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
+    if dict_path is None:
+        return None
+
+    with open(dict_path, "r") as f:
+        dictionary = f.readlines()
+        # first token is `<blank>`
+        # multi line: `<blank> 0\n`
+        # one line: `<blank>`
+        # space is relpace with <space>
+        char_list = [entry[:-1].split(" ")[0] for entry in dictionary]
+        if BLANK not in char_list:
+            char_list.insert(0, BLANK)
+        if EOS not in char_list:
+            char_list.append(EOS)
+        # for non-autoregressive maskctc model
+        if maskctc and MASKCTC not in char_list:
+            char_list.append(MASKCTC)
+        return char_list
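
Aside (illustration, not part of the commit): load_dict accepts either "<token>" or "<token> <id>" per line, keeps only the token column, and appends <blank>/<eos> (and <mask> for maskctc) when missing. A self-contained sketch of that parsing with invented file contents:

BLANK, EOS, MASKCTC = "<blank>", "<eos>", "<mask>"

def load_dict_sketch(lines, maskctc=False):
    # keep only the token column of "<token>" or "<token> <id>"
    char_list = [line.rstrip("\n").split(" ")[0] for line in lines]
    if BLANK not in char_list:
        char_list.insert(0, BLANK)
    if EOS not in char_list:
        char_list.append(EOS)
    if maskctc and MASKCTC not in char_list:
        char_list.append(MASKCTC)
    return char_list

print(load_dict_sketch(["<blank> 0\n", "a 1\n", "b 2\n"], maskctc=True))
# ['<blank>', 'a', 'b', '<eos>', '<mask>']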
 def read_manifest(
@@ -47,12 +76,20 @@ def read_manifest(
     Args:
         manifest_path ([type]): Manifest file to load and parse.
-        max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
-        min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
-        max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
-        min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
-        max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
-        min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
+        max_input_len ([type], optional): maximum output seq length,
+            in seconds for raw wav, in frame numbers for feature data.
+            Defaults to float('inf').
+        min_input_len (float, optional): minimum input seq length,
+            in seconds for raw wav, in frame numbers for feature data.
+            Defaults to 0.0.
+        max_output_len (float, optional): maximum input seq length,
+            in modeling units. Defaults to 500.0.
+        min_output_len (float, optional): minimum input seq length,
+            in modeling units. Defaults to 0.0.
+        max_output_input_ratio (float, optional):
+            maximum output seq length/output seq length ratio. Defaults to 10.0.
+        min_output_input_ratio (float, optional):
+            minimum output seq length/output seq length ratio. Defaults to 0.05.

     Raises:
         IOError: If failed to parse the manifest.
@@ -62,12 +99,8 @@ def read_manifest(
     """
     manifest = []
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
             feat_len = json_data["feat_shape"][
                 0] if 'feat_shape' in json_data else 1.0
             token_len = json_data["token_shape"][
@@ -85,6 +118,51 @@ def read_manifest(
     return manifest


+# Tar File read
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+
+
+def parse_tar(file):
+    """Parse a tar file to get a tarfile object
+    and a map containing tarinfoes
+    """
+    result = {}
+    f = tarfile.open(file)
+    for tarinfo in f.getmembers():
+        result[tarinfo.name] = tarinfo
+    return f, result
+
+
+def subfile_from_tar(file, local_data=None):
+    """Get subfile object from tar.
+
+    tar:tarpath#filename
+
+    It will return a subfile object from tar file
+    and cached tar file info for next reading request.
+    """
+    tarpath, filename = file.split(':', 1)[1].split('#', 1)
+
+    if local_data is None:
+        local_data = TarLocalData(tar2info={}, tar2object={})
+
+    assert isinstance(local_data, TarLocalData)
+
+    if 'tar2info' not in local_data.__dict__:
+        local_data.tar2info = {}
+    if 'tar2object' not in local_data.__dict__:
+        local_data.tar2object = {}
+
+    if tarpath not in local_data.tar2info:
+        fobj, infos = parse_tar(tarpath)
+        local_data.tar2info[tarpath] = infos
+        local_data.tar2object[tarpath] = fobj
+    else:
+        fobj = local_data.tar2object[tarpath]
+        infos = local_data.tar2info[tarpath]
+    return fobj.extractfile(infos[filename])
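
Aside (illustration, not part of the commit): a manifest "feat" field can now point inside a tar archive with a "tar:<tarpath>#<member>" spec; subfile_from_tar splits that spec, opens the archive once, and caches it in TarLocalData for later reads. The split itself, with a made-up path:

spec = "tar:/data/feats.tar#utt_0001.ark"
tarpath, filename = spec.split(':', 1)[1].split('#', 1)
print(tarpath, filename)  # /data/feats.tar utt_0001.ark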
 def rms_to_db(rms: float):
     """Root Mean Square to dB.
@@ -254,6 +332,13 @@ def load_cmvn(cmvn_file: str, filetype: str):
         cmvn = _load_json_cmvn(cmvn_file)
     elif filetype == "kaldi":
         cmvn = _load_kaldi_cmvn(cmvn_file)
+    elif filetype == "npz":
+        eps = 1e-14
+        npzfile = np.load(cmvn_file)
+        mean = np.squeeze(npzfile["mean"])
+        std = np.squeeze(npzfile["std"])
+        istd = 1 / (std + eps)
+        cmvn = [mean, istd]
     else:
         raise ValueError(f"cmvn file type no support: {filetype}")
     return cmvn[0], cmvn[1]
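
Aside (illustration, not part of the commit): producing an "npz" cmvn file of the shape the new branch expects, i.e. arrays named "mean" and "std". The statistics below are placeholders:

import numpy as np

feat_dim = 161  # assumed for the example
np.savez("cmvn.npz",
         mean=np.zeros((1, feat_dim), dtype='float32'),
         std=np.ones((1, feat_dim), dtype='float32'))

npzfile = np.load("cmvn.npz")
mean, std = np.squeeze(npzfile["mean"]), np.squeeze(npzfile["std"])
istd = 1 / (std + 1e-14)  # same eps guard as load_cmvn's npz branch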
@@ -23,7 +23,7 @@ logger = Log(__name__).getlog()

 class SpeechCollator():
-    def __init__(self, keep_transcription_text=True):
+    def __init__(self, keep_transcription_text=True, return_utts=False):
         """
         Padding audio features with zeros to make them have the same shape (or
         a user-defined shape) within one bach.
@@ -31,6 +31,7 @@ class SpeechCollator():
         if ``keep_transcription_text`` is False, text is token ids else is raw string.
         """
         self._keep_transcription_text = keep_transcription_text
+        self.return_utts = return_utts

     def __call__(self, batch):
         """batch examples
@@ -51,7 +52,9 @@ class SpeechCollator():
         audio_lens = []
         texts = []
         text_lens = []
-        for audio, text in batch:
+        utts = []
+        for utt, audio, text in batch:
+            utts.append(utt)
             # audio
             audios.append(audio.T)  # [T, D]
             audio_lens.append(audio.shape[1])
@@ -75,4 +78,7 @@ class SpeechCollator():
         padded_texts = pad_sequence(
             texts, padding_value=IGNORE_ID).astype(np.int64)
         text_lens = np.array(text_lens).astype(np.int64)
-        return padded_audios, audio_lens, padded_texts, text_lens
+        if self.return_utts:
+            return padded_audios, audio_lens, padded_texts, text_lens, utts
+        else:
+            return padded_audios, audio_lens, padded_texts, text_lens
\ No newline at end of file
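
Aside (illustration, not part of the commit): the padding the collator performs on token ids amounts to right-padding each sequence to the batch maximum with IGNORE_ID (-1). Pure numpy, standing in for the repo's pad_sequence helper:

import numpy as np

IGNORE_ID = -1
texts = [np.array([3, 7, 2]), np.array([5, 1])]
maxlen = max(len(t) for t in texts)
padded = np.stack([
    np.pad(t, (0, maxlen - len(t)), constant_values=IGNORE_ID) for t in texts
]).astype(np.int64)
# padded -> [[3, 7, 2], [5, 1, -1]]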
@@ -347,4 +347,5 @@ class ManifestDataset(Dataset):

     def __getitem__(self, idx):
         instance = self._manifest[idx]
-        return self.process_utterance(instance["feat"], instance["text"])
+        feat, text = self.process_utterance(instance["feat"], instance["text"])
+        return instance["utt"], feat, text
@@ -114,7 +114,7 @@ class ConvBn(nn.Layer):
         masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
         # TODO(Hui Zhang): not support bool multiply
-        masks = masks.type_as(x)
+        masks = masks.astype(x.dtype)
         x = x.multiply(masks)
         return x, x_len
......
@@ -219,15 +219,17 @@ class DeepSpeech2Model(nn.Layer):
             The model built from pretrained result.
         """
         model = cls(
-            feat_size=dataloader.collate_fn.feature_size,
-            dict_size=dataloader.collate_fn.vocab_size,
+            #feat_size=dataloader.collate_fn.feature_size,
+            feat_size=dataloader.dataset.feature_size,
+            #dict_size=dataloader.collate_fn.vocab_size,
+            dict_size=dataloader.dataset.vocab_size,
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights,
             blank_id=config.model.blank_id,
-            ctc_grad_norm_type=config.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -260,24 +262,8 @@ class DeepSpeech2Model(nn.Layer):

 class DeepSpeech2InferModel(DeepSpeech2Model):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True,
-                 blank_id=0):
-        super().__init__(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights,
-            blank_id=blank_id)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)

     def forward(self, audio, audio_len):
         """export model function
......
@@ -309,6 +309,6 @@ class RNNStack(nn.Layer):
         masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(-1)  # [B, T, 1]
         # TODO(Hui Zhang): not support bool multiply
-        masks = masks.type_as(x)
+        masks = masks.astype(x.dtype)
         x = x.multiply(masks)
         return x, x_len
@@ -255,12 +255,13 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 fc_layers_size_list=[512, 256],
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                 blank_id=0,  # index of blank in vocob.txt
-            ))
+                ctc_grad_norm_type='instance', ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default

-    def __init__(self,
+    def __init__(
+            self,
             feat_size,
             dict_size,
             num_conv_layers=2,
@@ -270,7 +271,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             num_fc_layers=2,
             fc_layers_size_list=[512, 256],
             use_gru=False,
-            blank_id=0):
+            blank_id=0,
+            ctc_grad_norm_type='instance', ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             dropout_rate=0.0,
             reduction=True,  # sum
             batch_average=True,  # sum / batch_size
-            grad_norm_type='instance')
+            grad_norm_type=ctc_grad_norm_type)

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss
@@ -348,7 +350,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             DeepSpeech2ModelOnline
             The model built from pretrained result.
         """
-        model = cls(feat_size=dataloader.collate_fn.feature_size,
+        model = cls(
+            feat_size=dataloader.collate_fn.feature_size,
             dict_size=dataloader.collate_fn.vocab_size,
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
@@ -357,7 +360,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             num_fc_layers=config.model.num_fc_layers,
             fc_layers_size_list=config.model.fc_layers_size_list,
             use_gru=config.model.use_gru,
-            blank_id=config.model.blank_id)
+            blank_id=config.model.blank_id,
+            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -376,7 +380,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             DeepSpeech2ModelOnline
             The model built from config.
         """
-        model = cls(feat_size=config.feat_size,
+        model = cls(
+            feat_size=config.feat_size,
             dict_size=config.dict_size,
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
@@ -385,33 +390,14 @@ class DeepSpeech2ModelOnline(nn.Layer):
             num_fc_layers=config.num_fc_layers,
             fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
-            blank_id=config.blank_id)
+            blank_id=config.blank_id,
+            ctc_grad_norm_type=config.ctc_grad_norm_type, )
         return model


 class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=4,
-                 rnn_size=1024,
-                 rnn_direction='forward',
-                 num_fc_layers=2,
-                 fc_layers_size_list=[512, 256],
-                 use_gru=False,
-                 blank_id=0):
-        super().__init__(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            rnn_direction=rnn_direction,
-            num_fc_layers=num_fc_layers,
-            fc_layers_size_list=fc_layers_size_list,
-            use_gru=use_gru,
-            blank_id=blank_id)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)

     def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                 chunk_state_c_box):
......
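The inference subclass previously repeated every constructor parameter of its parent, which is why a new argument such as `ctc_grad_norm_type` would otherwise have to be added in two places. A small sketch of the passthrough pattern the diff switches to (`Base`/`Infer` are stand-in names, not the repo's classes):

```python
class Base:
    def __init__(self, feat_size, dict_size, blank_id=0):
        self.feat_size = feat_size
        self.dict_size = dict_size
        self.blank_id = blank_id

class Infer(Base):
    # Forward everything: parameters added to Base later (e.g. a new
    # ctc_grad_norm_type) no longer require touching this signature.
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

m = Infer(161, 29)                           # positional arguments still work
m = Infer(feat_size=161, dict_size=29)       # and so do keywords
```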
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import time import time
from contextlib import contextmanager
from pathlib import Path from pathlib import Path
import paddle import paddle
...@@ -78,7 +79,7 @@ class Trainer(): ...@@ -78,7 +79,7 @@ class Trainer():
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 1 and args.device == "gpu": >>> if args.nprocs > 0:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
...@@ -93,18 +94,24 @@ class Trainer(): ...@@ -93,18 +94,24 @@ class Trainer():
self.checkpoint_dir = None self.checkpoint_dir = None
self.iteration = 0 self.iteration = 0
self.epoch = 0 self.epoch = 0
self._train = True
def setup(self): paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
if self.parallel: if self.parallel:
self.init_parallel() self.init_parallel()
@contextmanager
def eval(self):
self._train = False
yield
self._train = True
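The `eval()` context manager toggles `_train` around test/decode/export runs. As written, an exception inside the `with` block would leave the flag stuck at `False`; a hardened variant (the `try/finally` is an addition, not in this commit) restores it unconditionally:

```python
from contextlib import contextmanager

class Trainer:
    def __init__(self):
        self._train = True

    @contextmanager
    def eval(self):
        self._train = False
        try:
            yield
        finally:
            self._train = True  # restored even if the body raises

trainer = Trainer()
with trainer.eval():
    assert trainer._train is False
assert trainer._train is True
```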
def setup(self):
"""Setup the experiment.
"""
self.setup_output_dir() self.setup_output_dir()
self.dump_config() self.dump_config()
self.setup_visualizer() self.setup_visualizer()
self.setup_checkpointer()
self.setup_dataloader() self.setup_dataloader()
self.setup_model() self.setup_model()
...@@ -117,7 +124,7 @@ class Trainer(): ...@@ -117,7 +124,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with """A flag indicating whether the experiment should run with
multiprocessing. multiprocessing.
""" """
return self.args.device == "gpu" and self.args.nprocs > 1 return self.args.nprocs > 1
def init_parallel(self): def init_parallel(self):
"""Init environment for multiprocess training. """Init environment for multiprocess training.
...@@ -158,8 +165,8 @@ class Trainer(): ...@@ -158,8 +165,8 @@ class Trainer():
checkpoint_path=self.args.checkpoint_path) checkpoint_path=self.args.checkpoint_path)
if infos: if infos:
# restore from ckpt # restore from ckpt
self.iteration = infos["step"] self.iteration = infos["step"] + 1
self.epoch = infos["epoch"] self.epoch = infos["epoch"] + 1
scratch = False scratch = False
else: else:
self.iteration = 0 self.iteration = 0
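The `+ 1` here matters: checkpoint metadata records the last *finished* step and epoch, so resuming must start one past them or the first unit of work is repeated. A tiny worked example with hypothetical numbers:

```python
infos = {"step": 1999, "epoch": 4}   # hypothetical checkpoint metadata
iteration = infos["step"] + 1        # step 1999 already ran; resume at 2000
epoch = infos["epoch"] + 1           # epoch 4 already completed; resume at 5
print(iteration, epoch)              # -> 2000 5
```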
...@@ -237,31 +244,61 @@ class Trainer(): ...@@ -237,31 +244,61 @@ class Trainer():
try: try:
self.train() self.train()
except KeyboardInterrupt: except KeyboardInterrupt:
self.save()
exit(-1) exit(-1)
finally: finally:
self.destory() self.destory()
logger.info("Training Done.") logger.info("Train Done.")
def run_test(self):
"""Do Test/Decode"""
with self.eval():
self.resume_or_scratch()
try:
self.test()
except KeyboardInterrupt:
exit(-1)
logger.info("Test/Decode Done.")
def run_export(self):
"""Do Model Export"""
with self.eval():
try:
self.export()
except KeyboardInterrupt:
exit(-1)
logger.info("Export Done.")
def setup_output_dir(self): def setup_output_dir(self):
"""Create a directory used for output. """Create a directory used for output.
""" """
# output dir if self.args.output:
output_dir = Path(self.args.output).expanduser() output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True) elif self.args.checkpoint_path:
output_dir = Path(
self.args.checkpoint_path).expanduser().parent.parent
self.output_dir = output_dir self.output_dir = output_dir
self.output_dir.mkdir(parents=True, exist_ok=True)
def setup_checkpointer(self): self.checkpoint_dir = self.output_dir / "checkpoints"
"""Create a directory used to save checkpoints into. self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
It is "checkpoints" inside the output directory. self.log_dir = output_dir / "log"
""" self.log_dir.mkdir(parents=True, exist_ok=True)
# checkpoint dir
checkpoint_dir = self.output_dir / "checkpoints" self.test_dir = output_dir / "test"
checkpoint_dir.mkdir(exist_ok=True) self.test_dir.mkdir(parents=True, exist_ok=True)
self.decode_dir = output_dir / "decode"
self.decode_dir.mkdir(parents=True, exist_ok=True)
self.export_dir = output_dir / "export"
self.export_dir.mkdir(parents=True, exist_ok=True)
self.visual_dir = output_dir / "visual"
self.visual_dir.mkdir(parents=True, exist_ok=True)
self.checkpoint_dir = checkpoint_dir self.config_dir = output_dir / "conf"
self.config_dir.mkdir(parents=True, exist_ok=True)
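With `setup_checkpointer` folded into `setup_output_dir`, the experiment directory becomes a small fixed tree rather than a flat folder. A hedged sketch of the resulting layout; the `exp/deepspeech2` root is hypothetical:

```python
from pathlib import Path

output_dir = Path("exp/deepspeech2").expanduser()   # hypothetical experiment root
for name in ("checkpoints", "log", "test", "decode",
             "export", "visual", "conf"):
    # parents=True also creates the root on first run; exist_ok makes reruns safe
    (output_dir / name).mkdir(parents=True, exist_ok=True)
```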
@mp_tools.rank_zero_only @mp_tools.rank_zero_only
def destory(self): def destory(self):
...@@ -283,7 +320,7 @@ class Trainer(): ...@@ -283,7 +320,7 @@ class Trainer():
unexpected behaviors. unexpected behaviors.
""" """
# visualizer # visualizer
visualizer = SummaryWriter(logdir=str(self.output_dir)) visualizer = SummaryWriter(logdir=str(self.visual_dir))
self.visualizer = visualizer self.visualizer = visualizer
@mp_tools.rank_zero_only @mp_tools.rank_zero_only
...@@ -293,7 +330,14 @@ class Trainer(): ...@@ -293,7 +330,14 @@ class Trainer():
It is saved in to ``config.yaml`` in the output directory at the It is saved in to ``config.yaml`` in the output directory at the
beginning of the experiment. beginning of the experiment.
""" """
with open(self.output_dir / "config.yaml", 'wt') as f: config_file = self.config_dir / "config.yaml"
if self._train and config_file.exists():
time_stamp = time.strftime("%Y_%m_%d_%H_%M_%s", time.gmtime())
target_path = self.config_dir / ".".join(
[time_stamp, "config.yaml"])
config_file.rename(target_path)
with open(config_file, 'wt') as f:
print(self.config, file=f) print(self.config, file=f)
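`dump_config` now rotates an existing `config.yaml` aside under a timestamped name instead of silently overwriting it. One caveat: `%s` (lowercase) in the diff's format string is the glibc epoch-seconds extension, whereas `%S` is the portable two-digit seconds field; the sketch below assumes the latter was intended:

```python
import time
from pathlib import Path

config_file = Path("exp/conf/config.yaml")            # hypothetical location
config_file.parent.mkdir(parents=True, exist_ok=True)
if config_file.exists():
    # keep the previous run's config under a timestamped name
    stamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.gmtime())
    config_file.rename(config_file.with_name(f"{stamp}.config.yaml"))
config_file.write_text("batch_size: 4\n")             # stand-in for the real dump
```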
def train_batch(self): def train_batch(self):
...@@ -307,6 +351,18 @@ class Trainer(): ...@@ -307,6 +351,18 @@ class Trainer():
""" """
raise NotImplementedError("valid should be implemented.") raise NotImplementedError("valid should be implemented.")
@paddle.no_grad()
def test(self):
"""The test. A subclass should implement this method in Tester.
"""
raise NotImplementedError("test should be implemented.")
@paddle.no_grad()
def export(self):
"""The test. A subclass should implement this method in Tester.
"""
raise NotImplementedError("export should be implemented.")
def setup_model(self): def setup_model(self):
"""Setup model, criterion and optimizer, etc. A subclass should """Setup model, criterion and optimizer, etc. A subclass should
implement this method. implement this method.
......
...@@ -120,14 +120,15 @@ class Autolog: ...@@ -120,14 +120,15 @@ class Autolog:
model_precision="fp32"): model_precision="fp32"):
import auto_log import auto_log
pid = os.getpid() pid = os.getpid()
if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): if os.environ.get('CUDA_VISIBLE_DEVICES', None):
gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]) gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
infer_config = inference.Config() infer_config = inference.Config()
infer_config.enable_use_gpu(100, gpu_id) infer_config.enable_use_gpu(100, gpu_id)
else: else:
gpu_id = None gpu_id = None
infer_config = inference.Config() infer_config = inference.Config()
autolog = auto_log.AutoLogger(
self.autolog = auto_log.AutoLogger(
model_name=model_name, model_name=model_name,
model_precision=model_precision, model_precision=model_precision,
batch_size=batch_size, batch_size=batch_size,
...@@ -139,7 +140,6 @@ class Autolog: ...@@ -139,7 +140,6 @@ class Autolog:
gpu_ids=gpu_id, gpu_ids=gpu_id,
time_keys=['preprocess_time', 'inference_time', 'postprocess_time'], time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
warmup=0) warmup=0)
self.autolog = autolog
def getlog(self): def getlog(self):
return self.autolog return self.autolog
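The old check indexed `os.environ['CUDA_VISIBLE_DEVICES']` directly and raised `KeyError` whenever the variable was unset, breaking CPU-only environments; `os.environ.get` fixes that. A slightly stricter variant that also treats a whitespace-only value as "no GPU" (the `strip()` is an assumption beyond the diff):

```python
import os

devices = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if devices.strip():
    gpu_id = int(devices.split(',')[0])   # first visible device runs inference
else:
    gpu_id = None                         # unset or empty: stay on CPU
print(gpu_id)
```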
...@@ -2,3 +2,4 @@ dev-clean/ ...@@ -2,3 +2,4 @@ dev-clean/
manifest.dev-clean manifest.dev-clean
manifest.train-clean manifest.train-clean
train-clean/ train-clean/
*.meta
...@@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path): ...@@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path):
""" """
print("Creating manifest %s ..." % manifest_path) print("Creating manifest %s ..." % manifest_path)
json_lines = [] json_lines = []
total_sec = 0.0
total_text = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)): for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [ text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt') filename for filename in filelist if filename.endswith('trans.txt')
...@@ -80,10 +84,27 @@ def create_manifest(data_dir, manifest_path): ...@@ -80,10 +84,27 @@ def create_manifest(data_dir, manifest_path):
'text': 'text':
text text
})) }))
total_sec += duration
total_text += len(text)
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file: with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
for line in json_lines: for line in json_lines:
out_file.write(line + '\n') out_file.write(line + '\n')
subset = os.path.splitext(manifest_path)[1][1:]
manifest_dir = os.path.dirname(manifest_path)
data_dir_name = os.path.split(data_dir)[-1]
meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta'
with open(meta_path, 'w') as f:
print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file. """Download, unpack and create summmary manifest file.
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!"
exit 1
fi
exit 0
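The new scripts derive the GPU count from `CUDA_VISIBLE_DEVICES` instead of taking a `--device` flag, with `--nproc 0` meaning CPU. A rough Python rendering of the awk one-liner (awk prints `NF=0` on an empty line, which the guard reproduces). One apparent loose end in this script: `${type}` is never assigned, so the result file likely resolves to just `.align`:

```python
import os

# Rough equivalent of: echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}'
devices = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
ngpu = len(devices.split(",")) if devices else 0
print(f"using {ngpu} gpus...")
```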
#! /usr/bin/env bash #!/bin/bash
stage=-1 stage=-1
stop_stage=100 stop_stage=100
......
#! /usr/bin/env bash #!/bin/bash
. ${MAIN_ROOT}/utils/utility.sh . ${MAIN_ROOT}/utils/utility.sh
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 3 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path" echo "usage: $0 config_path ckpt_prefix jit_model_path"
...@@ -12,13 +12,7 @@ config_path=$1 ...@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2 ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then set -e
echo "usage: ${0} config_path ckpt_path_prefix"
expdir=exp
datadir=data
nj=32
lmtag=
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpemodel=${bpeprefix}.model
if [ $# != 3 ];then
echo "usage: ${0} config_path dict_path ckpt_path_prefix"
exit -1 exit -1
fi fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 dict=$2
ckpt_prefix=$3
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
echo "chunk mode ${chunk_mode}"
# download language model # download language model
#bash local/download_lm_en.sh #bash local/download_lm_en.sh
...@@ -21,39 +42,46 @@ ckpt_prefix=$2 ...@@ -21,39 +42,46 @@ ckpt_prefix=$2
# exit 1 # exit 1
#fi #fi
for type in attention ctc_greedy_search; do pids=() # initialize pids
echo "decoding ${type}"
batch_size=64
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
echo "Failed in evaluation!" (
exit 1 for rtask in ${recog_set}; do
fi (
done decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
feat_recog_dir=${datadir}
mkdir -p ${expdir}/${decode_dir}
mkdir -p ${feat_recog_dir}
# split data
split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
for type in ctc_prefix_beam_search attention_rescoring; do #### use CPU for decoding
echo "decoding ${type}" ngpu=0
# set batchsize 0 to disable batch decoding
batch_size=1 batch_size=1
${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--device ${device} \ --nproc ${ngpu} \
--nproc 1 \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${expdir}/${decode_dir}/data.JOB.json \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} --opts decoding.decoding_method ${dmethd} \
--opts decoding.batch_size ${batch_size} \
--opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
if [ $? -ne 0 ]; then ) &
echo "Failed in evaluation!" pids+=($!) # store background pids
exit 1 done
fi ) &
pids+=($!) # store background pids
done done
i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
[ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
echo "Finished"
exit 0 exit 0
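The rewritten test stage fans out one background decode job per (decoding method, recognition set) pair, then waits on all of them and fails if any child failed. A hedged Python analogue of the `pids`/`wait` pattern; the script path and flags are illustrative only:

```python
import subprocess

cmds = [["python3", "test.py", "--opts", f"decoding.decoding_method={m}"]
        for m in ("attention", "ctc_greedy_search",
                  "ctc_prefix_beam_search", "attention_rescoring")]
procs = [subprocess.Popen(c) for c in cmds]     # launch in the background
failed = sum(p.wait() != 0 for p in procs)      # join; count non-zero exits
if failed:
    raise SystemExit(f"{failed} background jobs are failed.")
print("Finished")
```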
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
...@@ -11,19 +11,28 @@ echo "using $ngpu gpus..." ...@@ -11,19 +11,28 @@ echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
device=gpu mkdir -p exp
if [ ngpu == 0 ];then
device=cpu # seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
#export FLAGS_cudnn_deterministic=True
echo "None"
fi fi
echo "using ${device}..."
mkdir -p exp # export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} != 0 ]; then
#unset FLAGS_cudnn_deterministic
echo "None"
fi
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in training!" echo "Failed in training!"
......
...@@ -4,6 +4,7 @@ data: ...@@ -4,6 +4,7 @@ data:
dev_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny test_manifest: data/manifest.tiny
mean_std_filepath: data/mean_std.json mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json augmentation_config: conf/augmentation.json
batch_size: 4 batch_size: 4
...@@ -35,6 +36,8 @@ model: ...@@ -35,6 +36,8 @@ model:
rnn_layer_size: 2048 rnn_layer_size: 2048
use_gru: False use_gru: False
share_rnn_weights: True share_rnn_weights: True
blank_id: 0
ctc_grad_norm_type: instance
training: training:
n_epoch: 20 n_epoch: 20
......
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
min_input_len: 0.0
max_input_len: 30.0
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
window_ms: 20.0
n_fft: None
max_freq: None
target_sample_rate: 16000
use_dB_normalization: True
target_dB: -20
dither: 1.0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
batch_size: 4
model:
num_conv_layers: 2
num_rnn_layers: 4
rnn_layer_size: 2048
rnn_direction: forward
num_fc_layers: 2
fc_layers_size_list: 512, 256
use_gru: True
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 10
accum_grad: 1
lr: 1e-5
lr_decay: 1.0
weight_decay: 1e-06
global_grad_clip: 5.0
log_interval: 1
checkpoint:
kbest_n: 3
latest_n: 2
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
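This is the new online-model config using the `data`/`collator` split introduced elsewhere in this commit. A hedged sketch of loading it; the repo's `get_cfg_defaults()` is yacs-based, and the filename below is assumed:

```python
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file("conf/deepspeech2_online.yaml")  # assumed filename
config.freeze()
print(config.model.ctc_grad_norm_type)                  # -> instance
print(config.collator.batch_size)                       # -> 4
```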
#! /usr/bin/env bash #!/bin/bash
. ${MAIN_ROOT}/utils/utility.sh . ${MAIN_ROOT}/utils/utility.sh
...@@ -9,6 +9,11 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm ...@@ -9,6 +9,11 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5" MD5="099a601759d467cd0a8523ff939819c5"
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
if [ -e $TARGET ];then
echo "$TARGET exists."
exit 0
fi
echo "Download language model ..." echo "Download language model ..."
download $URL $MD5 $TARGET download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path" echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
exit -1 exit -1
fi fi
...@@ -11,19 +11,14 @@ echo "using $ngpu gpus..." ...@@ -11,19 +11,14 @@ echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_path_prefix=$2 ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
model_type=$4
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} --export_path ${jit_model_export_path} \
--model_type ${model_type}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in export!" echo "Failed in export!"
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
model_type=$3
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
...@@ -22,11 +19,11 @@ if [ $? -ne 0 ]; then ...@@ -22,11 +19,11 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--device ${device} \ --nproc ${ngpu} \
--nproc 1 \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then profiler_options=
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1 # seed may break model convergence
fi seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 if [ ${seed} != 0 ]; then
ckpt_name=$2 export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
device=gpu if [ $# != 3 ];then
if [ ngpu == 0 ];then echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
device=cpu exit -1
fi fi
config_path=$1
ckpt_name=$2
model_type=$3
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name} \
--model_type ${model_type} \
--profiler-options "${profiler_options}" \
--seed ${seed}
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in training!" echo "Failed in training!"
......
export MAIN_ROOT=${PWD}/../../../ export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C export LC_ALL=C
......
...@@ -7,11 +7,12 @@ stage=0 ...@@ -7,11 +7,12 @@ stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
avg_num=1 avg_num=1
model_type=offline
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num} avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ###ckpt = deepspeech2
echo "checkpoint name ${ckpt}" echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...@@ -21,20 +22,20 @@ fi ...@@ -21,20 +22,20 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n # export ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
fi fi
...@@ -65,6 +65,8 @@ model: ...@@ -65,6 +65,8 @@ model:
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!"
exit 1
fi
exit 0
#! /usr/bin/env bash #!/bin/bash
stage=-1 stage=-1
stop_stage=100 stop_stage=100
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 3 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path" echo "usage: $0 config_path ckpt_prefix jit_model_path"
...@@ -12,13 +12,7 @@ config_path=$1 ...@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2 ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path ckpt_path_prefix"
...@@ -8,30 +8,57 @@ fi ...@@ -8,30 +8,57 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model # download language model
#bash local/download_lm_en.sh #bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then #if [ $? -ne 0 ]; then
# exit 1 # exit 1
#fi #fi
python3 -u ${BIN_DIR}/test.py \ for type in attention ctc_greedy_search; do
--device ${device} \ echo "decoding ${type}"
--nproc 1 \ if [ ${chunk_mode} == true ];then
--config ${config_path} \ # stream decoding only support batchsize=1
--result_file ${ckpt_prefix}.rsl \ batch_size=1
--checkpoint_path ${ckpt_prefix} else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
exit 1 exit 1
fi fi
done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0 exit 0
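Chunk (streaming) configs are detected purely from the filename, and streaming decode is pinned to batch size 1, while non-streaming attention/greedy decoding keeps batch 64. The filename test rendered in Python, with a hypothetical path:

```python
import re

config_path = "conf/chunk_conformer.yaml"             # hypothetical
chunk_mode = bool(re.match(r"^.*chunk_.*yaml$", config_path))
batch_size = 1 if chunk_mode else 64                  # streaming only supports bs=1
print(chunk_mode, batch_size)                         # -> True 1
```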
#! /usr/bin/env bash #!/bin/bash
profiler_options=
benchmark_batch_size=0
benchmark_max_step=0
# seed may break model convergence
seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1 exit -1
fi fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \ --seed ${seed} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name} \
--profiler-options "${profiler_options}" \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in training!" echo "Failed in training!"
......
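Several train scripts now gate `FLAGS_cudnn_deterministic` on a non-zero seed, since deterministic cudnn kernels are slower and, per the scripts' comment, seeding may break model convergence. A hedged sketch of the same toggle around a training call (`launch_training` is a stand-in for the `train.py` invocation):

```python
import os

def launch_training():
    """Stand-in for the python3 train.py invocation."""
    pass

seed = 1024                                    # hypothetical non-zero seed
if seed != 0:
    # deterministic kernels: reproducible runs, potentially slower training
    os.environ["FLAGS_cudnn_deterministic"] = "True"
try:
    launch_training()
finally:
    os.environ.pop("FLAGS_cudnn_deterministic", None)  # mirror the shell `unset`
```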
...@@ -20,20 +20,26 @@ fi ...@@ -20,20 +20,26 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} ./local/train.sh ${conf_path} ${ckpt}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n # export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi fi