提交 8e73d184 编写于 作者: H Hui Zhang

tiny/s0/s1 can run all

上级 6f83be1a
...@@ -18,8 +18,10 @@ import numpy as np ...@@ -18,8 +18,10 @@ import numpy as np
import paddle import paddle
from paddle.inference import Config from paddle.inference import Config
from paddle.inference import create_predictor from paddle.inference import create_predictor
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.ds2 import DeepSpeech2Model from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
...@@ -78,26 +80,31 @@ def inference(config, args): ...@@ -78,26 +80,31 @@ def inference(config, args):
def start_server(config, args): def start_server(config, args):
"""Start the ASR server""" """Start the ASR server"""
config.defrost() config.defrost()
config.data.manfiest = config.data.test_manifest config.data.manifest = config.data.test_manifest
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dataset = ManifestDataset.from_config(config) dataset = ManifestDataset.from_config(config)
model = DeepSpeech2Model.from_pretrained(dataset, config, config.collator.augmentation_config = ""
config.collator.keep_transcription_text = True
config.collator.batch_size = 1
config.collator.num_workers = 0
collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
model = DeepSpeech2Model.from_pretrained(test_loader, config,
args.checkpoint_path) args.checkpoint_path)
model.eval() model.eval()
# prepare ASR inference handler # prepare ASR inference handler
def file_to_transcript(filename): def file_to_transcript(filename):
feature = dataset.process_utterance(filename, "") feature = test_loader.collate_fn.process_utterance(filename, "")
audio = np.array([feature[0]]).astype('float32') #[1, D, T] audio = np.array([feature[0]]).astype('float32') #[1, T, D]
audio_len = feature[0].shape[1] audio_len = feature[0].shape[0]
audio_len = np.array([audio_len]).astype('int64') # [1] audio_len = np.array([audio_len]).astype('int64') # [1]
result_transcript = model.decode( result_transcript = model.decode(
paddle.to_tensor(audio), paddle.to_tensor(audio),
paddle.to_tensor(audio_len), paddle.to_tensor(audio_len),
vocab_list=dataset.vocab_list, vocab_list=test_loader.collate_fn.vocab_list,
decoding_method=config.decoding.decoding_method, decoding_method=config.decoding.decoding_method,
lang_model_path=config.decoding.lang_model_path, lang_model_path=config.decoding.lang_model_path,
beam_alpha=config.decoding.alpha, beam_alpha=config.decoding.alpha,
...@@ -138,7 +145,7 @@ if __name__ == "__main__": ...@@ -138,7 +145,7 @@ if __name__ == "__main__":
add_arg('host_ip', str, add_arg('host_ip', str,
'localhost', 'localhost',
"Server's IP address.") "Server's IP address.")
add_arg('host_port', int, 8086, "Server's IP port.") add_arg('host_port', int, 8089, "Server's IP port.")
add_arg('speech_save_dir', str, add_arg('speech_save_dir', str,
'demo_cache', 'demo_cache',
"Directory to save demo audios.") "Directory to save demo audios.")
......
...@@ -16,8 +16,10 @@ import functools ...@@ -16,8 +16,10 @@ import functools
import numpy as np import numpy as np
import paddle import paddle
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.ds2 import DeepSpeech2Model from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
...@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments ...@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
def start_server(config, args): def start_server(config, args):
"""Start the ASR server""" """Start the ASR server"""
config.defrost() config.defrost()
config.data.manfiest = config.data.test_manifest config.data.manifest = config.data.test_manifest
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dataset = ManifestDataset.from_config(config) dataset = ManifestDataset.from_config(config)
model = DeepSpeech2Model.from_pretrained(dataset, config, config.collator.augmentation_config = ""
config.collator.keep_transcription_text = True
config.collator.batch_size = 1
config.collator.num_workers = 0
collate_fn = SpeechCollator.from_config(config)
test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
model = DeepSpeech2Model.from_pretrained(test_loader, config,
args.checkpoint_path) args.checkpoint_path)
model.eval() model.eval()
# prepare ASR inference handler # prepare ASR inference handler
def file_to_transcript(filename): def file_to_transcript(filename):
feature = dataset.process_utterance(filename, "") feature = test_loader.collate_fn.process_utterance(filename, "")
audio = np.array([feature[0]]).astype('float32') #[1, D, T] audio = np.array([feature[0]]).astype('float32') #[1, T, D]
audio_len = feature[0].shape[1] # audio = audio.swapaxes(1,2)
print('---file_to_transcript feature----')
print(audio.shape)
audio_len = feature[0].shape[0]
print(audio_len)
audio_len = np.array([audio_len]).astype('int64') # [1] audio_len = np.array([audio_len]).astype('int64') # [1]
result_transcript = model.decode( result_transcript = model.decode(
paddle.to_tensor(audio), paddle.to_tensor(audio),
paddle.to_tensor(audio_len), paddle.to_tensor(audio_len),
vocab_list=dataset.vocab_list, vocab_list=test_loader.collate_fn.vocab_list,
decoding_method=config.decoding.decoding_method, decoding_method=config.decoding.decoding_method,
lang_model_path=config.decoding.lang_model_path, lang_model_path=config.decoding.lang_model_path,
beam_alpha=config.decoding.alpha, beam_alpha=config.decoding.alpha,
...@@ -91,7 +102,7 @@ if __name__ == "__main__": ...@@ -91,7 +102,7 @@ if __name__ == "__main__":
add_arg('host_ip', str, add_arg('host_ip', str,
'localhost', 'localhost',
"Server's IP address.") "Server's IP address.")
add_arg('host_port', int, 8086, "Server's IP port.") add_arg('host_port', int, 8088, "Server's IP port.")
add_arg('speech_save_dir', str, add_arg('speech_save_dir', str,
'demo_cache', 'demo_cache',
"Directory to save demo audios.") "Directory to save demo audios.")
......
...@@ -30,6 +30,9 @@ def main(config, args): ...@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__": if __name__ == "__main__":
parser = default_argument_parser() parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args() args = parser.parse_args()
print_arguments(args, globals()) print_arguments(args, globals())
......
...@@ -30,6 +30,9 @@ def main(config, args): ...@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__": if __name__ == "__main__":
parser = default_argument_parser() parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args() args = parser.parse_args()
print_arguments(args, globals()) print_arguments(args, globals())
......
...@@ -34,6 +34,9 @@ def main(config, args): ...@@ -34,6 +34,9 @@ def main(config, args):
if __name__ == "__main__": if __name__ == "__main__":
parser = default_argument_parser() parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args() args = parser.parse_args()
print_arguments(args, globals()) print_arguments(args, globals())
......
...@@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer ...@@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments from deepspeech.utils.utility import print_arguments
# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
def main_sp(config, args): def main_sp(config, args):
exp = Trainer(config, args) exp = Trainer(config, args)
...@@ -30,7 +32,7 @@ def main_sp(config, args): ...@@ -30,7 +32,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)
......
...@@ -73,11 +73,11 @@ class U2Trainer(Trainer): ...@@ -73,11 +73,11 @@ class U2Trainer(Trainer):
def __init__(self, config, args): def __init__(self, config, args):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch, msg):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
loss, attention_loss, ctc_loss = self.model(*batch_data) loss, attention_loss, ctc_loss = self.model(*batch)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward() loss.backward()
...@@ -219,7 +219,7 @@ class U2Trainer(Trainer): ...@@ -219,7 +219,7 @@ class U2Trainer(Trainer):
config.data.augmentation_config = "" config.data.augmentation_config = ""
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
collate_fn = SpeechCollator(keep_transcription_text=False) collate_fn = SpeechCollator(keep_transcription_text=False, return_utts=False)
if self.parallel: if self.parallel:
batch_sampler = SortagradDistributedBatchSampler( batch_sampler = SortagradDistributedBatchSampler(
train_dataset, train_dataset,
...@@ -269,7 +269,7 @@ class U2Trainer(Trainer): ...@@ -269,7 +269,7 @@ class U2Trainer(Trainer):
batch_size=config.decoding.batch_size, batch_size=config.decoding.batch_size,
shuffle=False, shuffle=False,
drop_last=False, drop_last=False,
collate_fn=SpeechCollator(keep_transcription_text=True)) collate_fn=SpeechCollator(keep_transcription_text=True, return_utts=True))
logger.info("Setup train/valid/test Dataloader!") logger.info("Setup train/valid/test Dataloader!")
def setup_model(self): def setup_model(self):
...@@ -428,7 +428,7 @@ class U2Tester(U2Trainer): ...@@ -428,7 +428,7 @@ class U2Tester(U2Trainer):
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout) metrics = self.compute_metrics(*batch[:-1], fout=fout)
num_frames += metrics['num_frames'] num_frames += metrics['num_frames']
num_time += metrics["decode_time"] num_time += metrics["decode_time"]
errors_sum += metrics['errors_sum'] errors_sum += metrics['errors_sum']
...@@ -476,12 +476,12 @@ class U2Tester(U2Trainer): ...@@ -476,12 +476,12 @@ class U2Tester(U2Trainer):
}) })
f.write(data + '\n') f.write(data + '\n')
def run_test(self): # def run_test(self):
self.resume_or_scratch() # self.resume_or_scratch()
try: # try:
self.test() # self.test()
except KeyboardInterrupt: # except KeyboardInterrupt:
sys.exit(-1) # sys.exit(-1)
def load_inferspec(self): def load_inferspec(self):
"""infer model and input spec. """infer model and input spec.
...@@ -512,36 +512,36 @@ class U2Tester(U2Trainer): ...@@ -512,36 +512,36 @@ class U2Tester(U2Trainer):
logger.info(f"Export code: {static_model.forward.code}") logger.info(f"Export code: {static_model.forward.code}")
paddle.jit.save(static_model, self.args.export_path) paddle.jit.save(static_model, self.args.export_path)
def run_export(self): # def run_export(self):
try: # try:
self.export() # self.export()
except KeyboardInterrupt: # except KeyboardInterrupt:
sys.exit(-1) # sys.exit(-1)
def setup(self): # def setup(self):
"""Setup the experiment. # """Setup the experiment.
""" # """
paddle.set_device(self.args.device) # paddle.set_device(self.args.device)
self.setup_output_dir() # self.setup_output_dir()
self.setup_checkpointer() # self.setup_checkpointer()
self.setup_dataloader() # self.setup_dataloader()
self.setup_model() # self.setup_model()
self.iteration = 0 # self.iteration = 0
self.epoch = 0 # self.epoch = 0
def setup_output_dir(self): # def setup_output_dir(self):
"""Create a directory used for output. # """Create a directory used for output.
""" # """
# output dir # # output dir
if self.args.output: # if self.args.output:
output_dir = Path(self.args.output).expanduser() # output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True) # output_dir.mkdir(parents=True, exist_ok=True)
else: # else:
output_dir = Path( # output_dir = Path(
self.args.checkpoint_path).expanduser().parent.parent # self.args.checkpoint_path).expanduser().parent.parent
output_dir.mkdir(parents=True, exist_ok=True) # output_dir.mkdir(parents=True, exist_ok=True)
self.output_dir = output_dir # self.output_dir = output_dir
...@@ -14,12 +14,27 @@ ...@@ -14,12 +14,27 @@
"""Contains the text featurizer class.""" """Contains the text featurizer class."""
import sentencepiece as spm import sentencepiece as spm
from deepspeech.frontend.utility import EOS from ..utility import EOS
from deepspeech.frontend.utility import UNK from ..utility import SPACE
from ..utility import UNK
from ..utility import SOS
from ..utility import BLANK
from ..utility import MASKCTC
from ..utility import load_dict
from deepspeech.utils.log import Log
class TextFeaturizer(object): logger = Log(__name__).getlog()
def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None):
__all__ = ["TextFeaturizer"]
class TextFeaturizer():
def __init__(self,
unit_type,
vocab_filepath,
spm_model_prefix=None,
maskctc=False):
"""Text featurizer, for processing or extracting features from text. """Text featurizer, for processing or extracting features from text.
Currently, it supports char/word/sentence-piece level tokenizing and conversion into Currently, it supports char/word/sentence-piece level tokenizing and conversion into
...@@ -34,20 +49,21 @@ class TextFeaturizer(object): ...@@ -34,20 +49,21 @@ class TextFeaturizer(object):
assert unit_type in ('char', 'spm', 'word') assert unit_type in ('char', 'spm', 'word')
self.unit_type = unit_type self.unit_type = unit_type
self.unk = UNK self.unk = UNK
self.maskctc = maskctc
if vocab_filepath: if vocab_filepath:
self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file( self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
vocab_filepath) vocab_filepath, maskctc)
self.unk_id = self._vocab_list.index(self.unk) self.vocab_size = len(self.vocab_list)
self.eos_id = self._vocab_list.index(EOS)
if unit_type == 'spm': if unit_type == 'spm':
spm_model = spm_model_prefix + '.model' spm_model = spm_model_prefix + '.model'
self.sp = spm.SentencePieceProcessor() self.sp = spm.SentencePieceProcessor()
self.sp.Load(spm_model) self.sp.Load(spm_model)
def tokenize(self, text): def tokenize(self, text, replace_space=True):
if self.unit_type == 'char': if self.unit_type == 'char':
tokens = self.char_tokenize(text) tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word': elif self.unit_type == 'word':
tokens = self.word_tokenize(text) tokens = self.word_tokenize(text)
else: # spm else: # spm
...@@ -67,7 +83,7 @@ class TextFeaturizer(object): ...@@ -67,7 +83,7 @@ class TextFeaturizer(object):
"""Convert text string to a list of token indices. """Convert text string to a list of token indices.
Args: Args:
text (str): Text to process. text (str): Text.
Returns: Returns:
List[int]: List of token indices. List[int]: List of token indices.
...@@ -75,8 +91,8 @@ class TextFeaturizer(object): ...@@ -75,8 +91,8 @@ class TextFeaturizer(object):
tokens = self.tokenize(text) tokens = self.tokenize(text)
ids = [] ids = []
for token in tokens: for token in tokens:
token = token if token in self._vocab_dict else self.unk token = token if token in self.vocab_dict else self.unk
ids.append(self._vocab_dict[token]) ids.append(self.vocab_dict[token])
return ids return ids
def defeaturize(self, idxs): def defeaturize(self, idxs):
...@@ -87,7 +103,7 @@ class TextFeaturizer(object): ...@@ -87,7 +103,7 @@ class TextFeaturizer(object):
idxs (List[int]): List of token indices. idxs (List[int]): List of token indices.
Returns: Returns:
str: Text to process. str: Text.
""" """
tokens = [] tokens = []
for idx in idxs: for idx in idxs:
...@@ -97,43 +113,22 @@ class TextFeaturizer(object): ...@@ -97,43 +113,22 @@ class TextFeaturizer(object):
text = self.detokenize(tokens) text = self.detokenize(tokens)
return text return text
@property def char_tokenize(self, text, replace_space=True):
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return len(self._vocab_list)
@property
def vocab_list(self):
"""Return the vocabulary in list.
Returns:
List[str]: tokens.
"""
return self._vocab_list
@property
def vocab_dict(self):
"""Return the vocabulary in dict.
Returns:
Dict[str, int]: token str -> int
"""
return self._vocab_dict
def char_tokenize(self, text):
"""Character tokenizer. """Character tokenizer.
Args: Args:
text (str): text string. text (str): text string.
replace_space (bool): False only used by build_vocab.py.
Returns: Returns:
List[str]: tokens. List[str]: tokens.
""" """
return list(text.strip()) text = text.strip()
if replace_space:
text_list = [SPACE if item == " " else item for item in list(text)]
else:
text_list = list(text)
return text_list
def char_detokenize(self, tokens):
    """Character detokenizer.

    Joins character tokens back into a string, turning every ``SPACE``
    marker back into a literal space.

    Args:
        tokens (List[str] or str): character tokens. An already-joined
            string is accepted as well.

    Returns:
        str: text string.
    """
    # A list has no `.replace`; calling it on the token list raised
    # AttributeError. Replace per token instead, and keep plain strings
    # working as before.
    if isinstance(tokens, str):
        return tokens.replace(SPACE, " ")
    return "".join(token.replace(SPACE, " ") for token in tokens)
def word_tokenize(self, text): def word_tokenize(self, text):
...@@ -206,14 +202,28 @@ class TextFeaturizer(object): ...@@ -206,14 +202,28 @@ class TextFeaturizer(object):
return decode(tokens) return decode(tokens)
def _load_vocabulary_from_file(self, vocab_filepath): def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool):
"""Load vocabulary from file.""" """Load vocabulary from file."""
vocab_lines = [] vocab_list = load_dict(vocab_filepath, maskctc)
with open(vocab_filepath, 'r', encoding='utf-8') as file: assert vocab_list is not None
vocab_lines.extend(file.readlines()) logger.info(f"Vocab: {vocab_list}")
vocab_list = [line[:-1] for line in vocab_lines]
id2token = dict( id2token = dict(
[(idx, token) for (idx, token) in enumerate(vocab_list)]) [(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict( token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)]) [(token, idx) for (idx, token) in enumerate(vocab_list)])
return token2id, id2token, vocab_list
blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1
unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1
space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1
logger.info(f"UNK id: {unk_id}")
logger.info(f"EOS id: {eos_id}")
logger.info(f"SOS id: {sos_id}")
logger.info(f"SPACE id: {space_id}")
logger.info(f"BLANK id: {blank_id}")
logger.info(f"MASKCTC id: {maskctc_id}")
return token2id, id2token, vocab_list, unk_id, eos_id
...@@ -12,10 +12,15 @@ ...@@ -12,10 +12,15 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains data helper functions.""" """Contains data helper functions."""
import codecs
import json import json
import math import math
import tarfile
from collections import namedtuple
from typing import List
from typing import Optional
from typing import Text
import jsonlines
import numpy as np import numpy as np
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
...@@ -23,16 +28,40 @@ from deepspeech.utils.log import Log ...@@ -23,16 +28,40 @@ from deepspeech.utils.log import Log
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
__all__ = [ __all__ = [
"load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs", "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "EOS", "UNK", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"BLANK" "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
] ]
IGNORE_ID = -1
# `sos` and `eos` share the same token.
SOS = "<eos>"
EOS = SOS
UNK = "<unk>"
BLANK = "<blank>"
MASKCTC = "<mask>"
SPACE = "<space>"


def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
    """Load the vocabulary (one token per line) from a text file.

    Each line is either ``<token>`` or ``<token> <index>``; only the text
    before the first space is kept. ``BLANK`` is inserted at position 0 and
    ``EOS`` appended when the file does not already contain them.

    Args:
        dict_path (Optional[Text]): path of the vocabulary file. ``None``
            means "no vocabulary".
        maskctc (bool): also append ``MASKCTC`` when it is missing
            (for the non-autoregressive maskctc model).

    Returns:
        Optional[List[Text]]: the token list, or ``None`` when ``dict_path``
        is ``None``.
    """
    if dict_path is None:
        return None

    # Read as UTF-8 explicitly: the vocabulary may hold non-ASCII tokens and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(dict_path, "r", encoding="utf-8") as f:
        # Strip only the line terminator. Slicing with `[:-1]` would drop the
        # last character of the final token when the file does not end with a
        # newline.
        char_list = [entry.rstrip("\n").split(" ")[0] for entry in f]

    # first token is `<blank>`
    #   multi column line: `<blank> 0\n`
    #   single column line: `<blank>`
    # a literal space is represented by `<space>` in the file
    if BLANK not in char_list:
        char_list.insert(0, BLANK)
    if EOS not in char_list:
        char_list.append(EOS)
    # for non-autoregressive maskctc model
    if maskctc and MASKCTC not in char_list:
        char_list.append(MASKCTC)
    return char_list
def read_manifest( def read_manifest(
...@@ -47,12 +76,20 @@ def read_manifest( ...@@ -47,12 +76,20 @@ def read_manifest(
Args: Args:
manifest_path ([type]): Manifest file to load and parse. manifest_path ([type]): Manifest file to load and parse.
max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). max_input_len ([type], optional): maximum output seq length,
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. in seconds for raw wav, in frame numbers for feature data.
max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. Defaults to float('inf').
min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. min_input_len (float, optional): minimum input seq length,
max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. in seconds for raw wav, in frame numbers for feature data.
min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. Defaults to 0.0.
max_output_len (float, optional): maximum input seq length,
in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum input seq length,
in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional):
maximum output seq length/output seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional):
minimum output seq length/output seq length ratio. Defaults to 0.05.
Raises: Raises:
IOError: If failed to parse the manifest. IOError: If failed to parse the manifest.
...@@ -62,12 +99,8 @@ def read_manifest( ...@@ -62,12 +99,8 @@ def read_manifest(
""" """
manifest = [] manifest = []
for json_line in codecs.open(manifest_path, 'r', 'utf-8'): with jsonlines.open(manifest_path, 'r') as reader:
try: for json_data in reader:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
feat_len = json_data["feat_shape"][ feat_len = json_data["feat_shape"][
0] if 'feat_shape' in json_data else 1.0 0] if 'feat_shape' in json_data else 1.0
token_len = json_data["token_shape"][ token_len = json_data["token_shape"][
...@@ -85,6 +118,51 @@ def read_manifest( ...@@ -85,6 +118,51 @@ def read_manifest(
return manifest return manifest
# Tar File read
# Cache shared between reads: both fields are dicts keyed by tar path.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])


def parse_tar(file):
    """Open a tar file and index its members by name.

    Args:
        file (str): path of the tar archive.

    Returns:
        Tuple[tarfile.TarFile, Dict[str, tarfile.TarInfo]]: the archive,
        left open so members can be extracted later, and a map from member
        name to its ``TarInfo``.
    """
    f = tarfile.open(file)
    try:
        result = {tarinfo.name: tarinfo for tarinfo in f.getmembers()}
    except Exception:
        # Do not leak the handle when the archive cannot be indexed.
        f.close()
        raise
    return f, result


def subfile_from_tar(file, local_data=None):
    """Get a subfile object from a tar archive.

    Args:
        file (str): specifier of the form ``tar:tarpath#filename``.
        local_data (Optional[TarLocalData]): cache of opened archives and
            their member maps. Pass the same instance across calls so each
            archive is opened and indexed only once. When ``None``, a
            throw-away cache is used for this call.

    Returns:
        The file-like object returned by ``TarFile.extractfile`` for the
        requested member.
    """
    tarpath, filename = file.split(':', 1)[1].split('#', 1)
    if local_data is None:
        local_data = TarLocalData(tar2info={}, tar2object={})
    assert isinstance(local_data, TarLocalData)

    # NOTE: a namedtuple instance has no `__dict__` and its fields cannot be
    # reassigned, so probing `local_data.__dict__` raised AttributeError.
    # Both fields always exist on a TarLocalData; only the dicts they hold
    # are mutated below.
    if tarpath not in local_data.tar2info:
        fobj, infos = parse_tar(tarpath)
        local_data.tar2info[tarpath] = infos
        local_data.tar2object[tarpath] = fobj
    else:
        fobj = local_data.tar2object[tarpath]
        infos = local_data.tar2info[tarpath]
    return fobj.extractfile(infos[filename])
def rms_to_db(rms: float): def rms_to_db(rms: float):
"""Root Mean Square to dB. """Root Mean Square to dB.
...@@ -254,6 +332,13 @@ def load_cmvn(cmvn_file: str, filetype: str): ...@@ -254,6 +332,13 @@ def load_cmvn(cmvn_file: str, filetype: str):
cmvn = _load_json_cmvn(cmvn_file) cmvn = _load_json_cmvn(cmvn_file)
elif filetype == "kaldi": elif filetype == "kaldi":
cmvn = _load_kaldi_cmvn(cmvn_file) cmvn = _load_kaldi_cmvn(cmvn_file)
elif filetype == "npz":
eps = 1e-14
npzfile = np.load(cmvn_file)
mean = np.squeeze(npzfile["mean"])
std = np.squeeze(npzfile["std"])
istd = 1 / (std + eps)
cmvn = [mean, istd]
else: else:
raise ValueError(f"cmvn file type no support: {filetype}") raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1] return cmvn[0], cmvn[1]
...@@ -23,7 +23,7 @@ logger = Log(__name__).getlog() ...@@ -23,7 +23,7 @@ logger = Log(__name__).getlog()
class SpeechCollator(): class SpeechCollator():
def __init__(self, keep_transcription_text=True): def __init__(self, keep_transcription_text=True, return_utts=False):
""" """
Padding audio features with zeros to make them have the same shape (or Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one bach. a user-defined shape) within one bach.
...@@ -31,6 +31,7 @@ class SpeechCollator(): ...@@ -31,6 +31,7 @@ class SpeechCollator():
if ``keep_transcription_text`` is False, text is token ids else is raw string. if ``keep_transcription_text`` is False, text is token ids else is raw string.
""" """
self._keep_transcription_text = keep_transcription_text self._keep_transcription_text = keep_transcription_text
self.return_utts = return_utts
def __call__(self, batch): def __call__(self, batch):
"""batch examples """batch examples
...@@ -51,7 +52,9 @@ class SpeechCollator(): ...@@ -51,7 +52,9 @@ class SpeechCollator():
audio_lens = [] audio_lens = []
texts = [] texts = []
text_lens = [] text_lens = []
for audio, text in batch: utts = []
for utt, audio, text in batch:
utts.append(utt)
# audio # audio
audios.append(audio.T) # [T, D] audios.append(audio.T) # [T, D]
audio_lens.append(audio.shape[1]) audio_lens.append(audio.shape[1])
...@@ -75,4 +78,7 @@ class SpeechCollator(): ...@@ -75,4 +78,7 @@ class SpeechCollator():
padded_texts = pad_sequence( padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64) texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64)
if self.return_utts:
return padded_audios, audio_lens, padded_texts, text_lens, utts
else:
return padded_audios, audio_lens, padded_texts, text_lens return padded_audios, audio_lens, padded_texts, text_lens
\ No newline at end of file
...@@ -347,4 +347,5 @@ class ManifestDataset(Dataset): ...@@ -347,4 +347,5 @@ class ManifestDataset(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
instance = self._manifest[idx] instance = self._manifest[idx]
return self.process_utterance(instance["feat"], instance["text"]) feat, text = self.process_utterance(instance["feat"], instance["text"])
return instance["utt"], feat, text
...@@ -114,7 +114,7 @@ class ConvBn(nn.Layer): ...@@ -114,7 +114,7 @@ class ConvBn(nn.Layer):
masks = make_non_pad_mask(x_len) #[B, T] masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply # TODO(Hui Zhang): not support bool multiply
masks = masks.type_as(x) masks = masks.astype(x.dtype)
x = x.multiply(masks) x = x.multiply(masks)
return x, x_len return x, x_len
......
...@@ -219,15 +219,17 @@ class DeepSpeech2Model(nn.Layer): ...@@ -219,15 +219,17 @@ class DeepSpeech2Model(nn.Layer):
The model built from pretrained result. The model built from pretrained result.
""" """
model = cls( model = cls(
feat_size=dataloader.collate_fn.feature_size, #feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size, feat_size=dataloader.dataset.feature_size,
#dict_size=dataloader.collate_fn.vocab_size,
dict_size=dataloader.dataset.vocab_size,
num_conv_layers=config.model.num_conv_layers, num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers, num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size, rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru, use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights, share_rnn_weights=config.model.share_rnn_weights,
blank_id=config.model.blank_id, blank_id=config.model.blank_id,
ctc_grad_norm_type=config.ctc_grad_norm_type, ) ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path) model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}") logger.info(f"checkpoint info: {infos}")
...@@ -260,24 +262,8 @@ class DeepSpeech2Model(nn.Layer): ...@@ -260,24 +262,8 @@ class DeepSpeech2Model(nn.Layer):
class DeepSpeech2InferModel(DeepSpeech2Model): class DeepSpeech2InferModel(DeepSpeech2Model):
def __init__(self, def __init__(self, *args, **kwargs):
feat_size, super().__init__(*args, **kwargs)
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights,
blank_id=blank_id)
def forward(self, audio, audio_len): def forward(self, audio, audio_len):
"""export model function """export model function
......
...@@ -309,6 +309,6 @@ class RNNStack(nn.Layer): ...@@ -309,6 +309,6 @@ class RNNStack(nn.Layer):
masks = make_non_pad_mask(x_len) #[B, T] masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1] masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply # TODO(Hui Zhang): not support bool multiply
masks = masks.type_as(x) masks = masks.astype(x.dtype)
x = x.multiply(masks) x = x.multiply(masks)
return x, x_len return x, x_len
...@@ -255,12 +255,13 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -255,12 +255,13 @@ class DeepSpeech2ModelOnline(nn.Layer):
fc_layers_size_list=[512, 256], fc_layers_size_list=[512, 256],
use_gru=True, #Use gru if set True. Use simple rnn if set False. use_gru=True, #Use gru if set True. Use simple rnn if set False.
blank_id=0, # index of blank in vocob.txt blank_id=0, # index of blank in vocob.txt
)) ctc_grad_norm_type='instance', ))
if config is not None: if config is not None:
config.merge_from_other_cfg(default) config.merge_from_other_cfg(default)
return default return default
def __init__(self, def __init__(
self,
feat_size, feat_size,
dict_size, dict_size,
num_conv_layers=2, num_conv_layers=2,
...@@ -270,7 +271,8 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -270,7 +271,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
num_fc_layers=2, num_fc_layers=2,
fc_layers_size_list=[512, 256], fc_layers_size_list=[512, 256],
use_gru=False, use_gru=False,
blank_id=0): blank_id=0,
ctc_grad_norm_type='instance', ):
super().__init__() super().__init__()
self.encoder = CRNNEncoder( self.encoder = CRNNEncoder(
feat_size=feat_size, feat_size=feat_size,
...@@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
dropout_rate=0.0, dropout_rate=0.0,
reduction=True, # sum reduction=True, # sum
batch_average=True, # sum / batch_size batch_average=True, # sum / batch_size
grad_norm_type='instance') grad_norm_type=ctc_grad_norm_type)
def forward(self, audio, audio_len, text, text_len): def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss """Compute Model loss
...@@ -348,7 +350,8 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -348,7 +350,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
DeepSpeech2ModelOnline DeepSpeech2ModelOnline
The model built from pretrained result. The model built from pretrained result.
""" """
model = cls(feat_size=dataloader.collate_fn.feature_size, model = cls(
feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size, dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers, num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers, num_rnn_layers=config.model.num_rnn_layers,
...@@ -357,7 +360,8 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -357,7 +360,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
num_fc_layers=config.model.num_fc_layers, num_fc_layers=config.model.num_fc_layers,
fc_layers_size_list=config.model.fc_layers_size_list, fc_layers_size_list=config.model.fc_layers_size_list,
use_gru=config.model.use_gru, use_gru=config.model.use_gru,
blank_id=config.model.blank_id) blank_id=config.model.blank_id,
ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path) model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}") logger.info(f"checkpoint info: {infos}")
...@@ -376,7 +380,8 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -376,7 +380,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
DeepSpeech2ModelOnline DeepSpeech2ModelOnline
The model built from config. The model built from config.
""" """
model = cls(feat_size=config.feat_size, model = cls(
feat_size=config.feat_size,
dict_size=config.dict_size, dict_size=config.dict_size,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
...@@ -385,33 +390,14 @@ class DeepSpeech2ModelOnline(nn.Layer): ...@@ -385,33 +390,14 @@ class DeepSpeech2ModelOnline(nn.Layer):
num_fc_layers=config.num_fc_layers, num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list, fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru, use_gru=config.use_gru,
blank_id=config.blank_id) blank_id=config.blank_id,
ctc_grad_norm_type=config.ctc_grad_norm_type, )
return model return model
class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
def __init__(self, def __init__(self, *args, **kwargs):
feat_size, super().__init__(*args, **kwargs)
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
use_gru=use_gru,
blank_id=blank_id)
def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box, def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
chunk_state_c_box): chunk_state_c_box):
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import time import time
from contextlib import contextmanager
from pathlib import Path from pathlib import Path
import paddle import paddle
...@@ -78,7 +79,7 @@ class Trainer(): ...@@ -78,7 +79,7 @@ class Trainer():
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 1 and args.device == "gpu": >>> if args.nprocs > 0:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
...@@ -93,18 +94,24 @@ class Trainer(): ...@@ -93,18 +94,24 @@ class Trainer():
self.checkpoint_dir = None self.checkpoint_dir = None
self.iteration = 0 self.iteration = 0
self.epoch = 0 self.epoch = 0
self._train = True
def setup(self): paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
if self.parallel: if self.parallel:
self.init_parallel() self.init_parallel()
@contextmanager
def eval(self):
    """Temporarily switch the trainer out of training mode.

    Sets ``self._train`` to ``False`` for the duration of the ``with``
    block and back to ``True`` when the block is left. The reset lives in
    a ``finally`` clause so it also runs when the body raises; without it
    a failed test/export would leave the flag stuck at ``False``.
    """
    self._train = False
    try:
        yield
    finally:
        self._train = True
def setup(self):
"""Setup the experiment.
"""
self.setup_output_dir() self.setup_output_dir()
self.dump_config() self.dump_config()
self.setup_visualizer() self.setup_visualizer()
self.setup_checkpointer()
self.setup_dataloader() self.setup_dataloader()
self.setup_model() self.setup_model()
...@@ -117,7 +124,7 @@ class Trainer(): ...@@ -117,7 +124,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with """A flag indicating whether the experiment should run with
multiprocessing. multiprocessing.
""" """
return self.args.device == "gpu" and self.args.nprocs > 1 return self.args.nprocs > 1
def init_parallel(self): def init_parallel(self):
"""Init environment for multiprocess training. """Init environment for multiprocess training.
...@@ -158,8 +165,8 @@ class Trainer(): ...@@ -158,8 +165,8 @@ class Trainer():
checkpoint_path=self.args.checkpoint_path) checkpoint_path=self.args.checkpoint_path)
if infos: if infos:
# restore from ckpt # restore from ckpt
self.iteration = infos["step"] self.iteration = infos["step"] + 1
self.epoch = infos["epoch"] self.epoch = infos["epoch"] + 1
scratch = False scratch = False
else: else:
self.iteration = 0 self.iteration = 0
...@@ -237,31 +244,61 @@ class Trainer(): ...@@ -237,31 +244,61 @@ class Trainer():
try: try:
self.train() self.train()
except KeyboardInterrupt: except KeyboardInterrupt:
self.save()
exit(-1) exit(-1)
finally: finally:
self.destory() self.destory()
logger.info("Training Done.") logger.info("Train Done.")
def run_test(self):
"""Do Test/Decode.

Enters ``self.eval()``, calls ``self.resume_or_scratch()`` and then
``self.test()``. A ``KeyboardInterrupt`` raised by ``self.test()`` ends the
process through ``exit(-1)``; once the test returns, "Test/Decode Done."
is logged.
"""
with self.eval():
self.resume_or_scratch()
try:
self.test()
except KeyboardInterrupt:
exit(-1)
logger.info("Test/Decode Done.")
def run_export(self):
"""Do Model Export.

Enters ``self.eval()`` and calls ``self.export()``. A ``KeyboardInterrupt``
raised by ``self.export()`` ends the process through ``exit(-1)``; once the
export returns, "Export Done." is logged.
"""
with self.eval():
try:
self.export()
except KeyboardInterrupt:
exit(-1)
logger.info("Export Done.")
def setup_output_dir(self): def setup_output_dir(self):
"""Create a directory used for output. """Create a directory used for output.
""" """
# output dir if self.args.output:
output_dir = Path(self.args.output).expanduser() output_dir = Path(self.args.output).expanduser()
output_dir.mkdir(parents=True, exist_ok=True) elif self.args.checkpoint_path:
output_dir = Path(
self.args.checkpoint_path).expanduser().parent.parent
self.output_dir = output_dir self.output_dir = output_dir
self.output_dir.mkdir(parents=True, exist_ok=True)
def setup_checkpointer(self): self.checkpoint_dir = self.output_dir / "checkpoints"
"""Create a directory used to save checkpoints into. self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
It is "checkpoints" inside the output directory. self.log_dir = output_dir / "log"
""" self.log_dir.mkdir(parents=True, exist_ok=True)
# checkpoint dir
checkpoint_dir = self.output_dir / "checkpoints" self.test_dir = output_dir / "test"
checkpoint_dir.mkdir(exist_ok=True) self.test_dir.mkdir(parents=True, exist_ok=True)
self.decode_dir = output_dir / "decode"
self.decode_dir.mkdir(parents=True, exist_ok=True)
self.export_dir = output_dir / "export"
self.export_dir.mkdir(parents=True, exist_ok=True)
self.visual_dir = output_dir / "visual"
self.visual_dir.mkdir(parents=True, exist_ok=True)
self.checkpoint_dir = checkpoint_dir self.config_dir = output_dir / "conf"
self.config_dir.mkdir(parents=True, exist_ok=True)
@mp_tools.rank_zero_only @mp_tools.rank_zero_only
def destory(self): def destory(self):
...@@ -283,7 +320,7 @@ class Trainer(): ...@@ -283,7 +320,7 @@ class Trainer():
unexpected behaviors. unexpected behaviors.
""" """
# visualizer # visualizer
visualizer = SummaryWriter(logdir=str(self.output_dir)) visualizer = SummaryWriter(logdir=str(self.visual_dir))
self.visualizer = visualizer self.visualizer = visualizer
@mp_tools.rank_zero_only @mp_tools.rank_zero_only
...@@ -293,7 +330,14 @@ class Trainer(): ...@@ -293,7 +330,14 @@ class Trainer():
It is saved in to ``config.yaml`` in the output directory at the It is saved in to ``config.yaml`` in the output directory at the
beginning of the experiment. beginning of the experiment.
""" """
with open(self.output_dir / "config.yaml", 'wt') as f: config_file = self.config_dir / "config.yaml"
# In training mode, keep an already existing config.yaml by renaming it with
# a UTC timestamp prefix before the new one is written.
if self._train and config_file.exists():
    # %S is the two-digit seconds field. The previous %s is not a documented
    # Python directive: where the C library accepts it, it expands to
    # seconds since the epoch, and elsewhere it is unsupported.
    time_stamp = time.strftime("%Y_%m_%d_%H_%M_%S", time.gmtime())
    target_path = self.config_dir / ".".join([time_stamp, "config.yaml"])
    config_file.rename(target_path)
with open(config_file, 'wt') as f:
print(self.config, file=f) print(self.config, file=f)
def train_batch(self): def train_batch(self):
...@@ -307,6 +351,18 @@ class Trainer(): ...@@ -307,6 +351,18 @@ class Trainer():
""" """
raise NotImplementedError("valid should be implemented.") raise NotImplementedError("valid should be implemented.")
@paddle.no_grad()
def test(self):
"""Test/decode hook, executed under ``paddle.no_grad()``.

A subclass should implement this method in Tester; this base version
always raises ``NotImplementedError``.
"""
raise NotImplementedError("test should be implemented.")
@paddle.no_grad()
def export(self):
"""Model export hook, executed under ``paddle.no_grad()``.

A subclass should implement this method in Tester; this base version
always raises ``NotImplementedError``.
"""
raise NotImplementedError("export should be implemented.")
def setup_model(self): def setup_model(self):
"""Setup model, criterion and optimizer, etc. A subclass should """Setup model, criterion and optimizer, etc. A subclass should
implement this method. implement this method.
......
...@@ -120,14 +120,15 @@ class Autolog: ...@@ -120,14 +120,15 @@ class Autolog:
model_precision="fp32"): model_precision="fp32"):
import auto_log import auto_log
pid = os.getpid() pid = os.getpid()
if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): if os.environ.get('CUDA_VISIBLE_DEVICES', None):
gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]) gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
infer_config = inference.Config() infer_config = inference.Config()
infer_config.enable_use_gpu(100, gpu_id) infer_config.enable_use_gpu(100, gpu_id)
else: else:
gpu_id = None gpu_id = None
infer_config = inference.Config() infer_config = inference.Config()
autolog = auto_log.AutoLogger(
self.autolog = auto_log.AutoLogger(
model_name=model_name, model_name=model_name,
model_precision=model_precision, model_precision=model_precision,
batch_size=batch_size, batch_size=batch_size,
...@@ -139,7 +140,6 @@ class Autolog: ...@@ -139,7 +140,6 @@ class Autolog:
gpu_ids=gpu_id, gpu_ids=gpu_id,
time_keys=['preprocess_time', 'inference_time', 'postprocess_time'], time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
warmup=0) warmup=0)
self.autolog = autolog
def getlog(self): def getlog(self):
return self.autolog return self.autolog
...@@ -2,3 +2,4 @@ dev-clean/ ...@@ -2,3 +2,4 @@ dev-clean/
manifest.dev-clean manifest.dev-clean
manifest.train-clean manifest.train-clean
train-clean/ train-clean/
*.meta
...@@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path): ...@@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path):
""" """
print("Creating manifest %s ..." % manifest_path) print("Creating manifest %s ..." % manifest_path)
json_lines = [] json_lines = []
total_sec = 0.0
total_text = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)): for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [ text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt') filename for filename in filelist if filename.endswith('trans.txt')
...@@ -80,10 +84,27 @@ def create_manifest(data_dir, manifest_path): ...@@ -80,10 +84,27 @@ def create_manifest(data_dir, manifest_path):
'text': 'text':
text text
})) }))
total_sec += duration
total_text += len(text)
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file: with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
for line in json_lines: for line in json_lines:
out_file.write(line + '\n') out_file.write(line + '\n')
# Write a "<data_dir name>.meta" summary file into the manifest's directory.
# The subset label is the manifest file's extension without the leading dot.
subset = os.path.splitext(manifest_path)[1][1:]
manifest_dir = os.path.dirname(manifest_path)
data_dir_name = os.path.split(data_dir)[-1]
meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta'
# With no utterances both totals stay at zero; report 0.0 for the ratios
# instead of raising ZeroDivisionError half-way through writing the file.
text_per_sec = total_text / total_sec if total_sec else 0.0
sec_per_utt = total_sec / total_num if total_num else 0.0
with open(meta_path, 'w') as f:
    print(f"{subset}:", file=f)
    print(f"{total_num} utts", file=f)
    print(f"{total_sec / (60*60)} h", file=f)
    print(f"{total_text} text", file=f)
    print(f"{text_per_sec} text/sec", file=f)
    print(f"{sec_per_utt} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file. """Download, unpack and create summmary manifest file.
......
#!/bin/bash
# Runs ${BIN_DIR}/alignment.py for one checkpoint.
#   $1  config_path       file passed to --config
#   $2  ckpt_path_prefix  checkpoint prefix; also used as the output directory
# BIN_DIR is not set here and must come from the caller's environment.

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

# Number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 when empty).
ngpu=$(echo "$CUDA_VISIBLE_DEVICES" | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2

batch_size=1
output_dir=${ckpt_prefix}
# Expansions below are quoted so paths containing spaces are not word-split.
mkdir -p "${output_dir}"

# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
# NOTE(review): ${type} is never assigned in this script; unless the caller
# exports it, the result file is "${output_dir}/.align" -- confirm intent.
python3 -u "${BIN_DIR}/alignment.py" \
    --nproc "${ngpu}" \
    --config "${config_path}" \
    --result_file "${output_dir}/${type}.align" \
    --checkpoint_path "${ckpt_prefix}" \
    --opts decoding.batch_size "${batch_size}"

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
fi

exit 0
#! /usr/bin/env bash #!/bin/bash
stage=-1 stage=-1
stop_stage=100 stop_stage=100
......
#! /usr/bin/env bash #!/bin/bash
. ${MAIN_ROOT}/utils/utility.sh . ${MAIN_ROOT}/utils/utility.sh
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 3 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path" echo "usage: $0 config_path ckpt_prefix jit_model_path"
...@@ -12,13 +12,7 @@ config_path=$1 ...@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2 ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then set -e
echo "usage: ${0} config_path ckpt_path_prefix"
expdir=exp
datadir=data
nj=32
lmtag=
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpemodel=${bpeprefix}.model
if [ $# != 3 ];then
echo "usage: ${0} config_path dict_path ckpt_path_prefix"
exit -1 exit -1
fi fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 dict=$2
ckpt_prefix=$3
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
echo "chunk mode ${chunk_mode}"
# download language model # download language model
#bash local/download_lm_en.sh #bash local/download_lm_en.sh
...@@ -21,39 +42,46 @@ ckpt_prefix=$2 ...@@ -21,39 +42,46 @@ ckpt_prefix=$2
# exit 1 # exit 1
#fi #fi
for type in attention ctc_greedy_search; do pids=() # initialize pids
echo "decoding ${type}"
batch_size=64
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
echo "Failed in evaluation!" (
exit 1 for rtask in ${recog_set}; do
fi (
done decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
feat_recog_dir=${datadir}
mkdir -p ${expdir}/${decode_dir}
mkdir -p ${feat_recog_dir}
# split data
split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
for type in ctc_prefix_beam_search attention_rescoring; do #### use CPU for decoding
echo "decoding ${type}" ngpu=0
# set batchsize 0 to disable batch decoding
batch_size=1 batch_size=1
${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--device ${device} \ --nproc ${ngpu} \
--nproc 1 \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \ --result_file ${expdir}/${decode_dir}/data.JOB.json \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} --opts decoding.decoding_method ${dmethd} \
--opts decoding.batch_size ${batch_size} \
--opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
if [ $? -ne 0 ]; then ) &
echo "Failed in evaluation!" pids+=($!) # store background pids
exit 1 done
fi ) &
pids+=($!) # store background pids
done done
i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
[ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
echo "Finished"
exit 0 exit 0
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
...@@ -11,19 +11,28 @@ echo "using $ngpu gpus..." ...@@ -11,19 +11,28 @@ echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
device=gpu mkdir -p exp
if [ ngpu == 0 ];then
device=cpu # seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
#export FLAGS_cudnn_deterministic=True
echo "None"
fi fi
echo "using ${device}..."
mkdir -p exp # export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name} \
--seed ${seed}
if [ ${seed} != 0 ]; then
#unset FLAGS_cudnn_deterministic
echo "None"
fi
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in training!" echo "Failed in training!"
......
...@@ -4,6 +4,7 @@ data: ...@@ -4,6 +4,7 @@ data:
dev_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny test_manifest: data/manifest.tiny
mean_std_filepath: data/mean_std.json mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json augmentation_config: conf/augmentation.json
batch_size: 4 batch_size: 4
...@@ -35,6 +36,8 @@ model: ...@@ -35,6 +36,8 @@ model:
rnn_layer_size: 2048 rnn_layer_size: 2048
use_gru: False use_gru: False
share_rnn_weights: True share_rnn_weights: True
blank_id: 0
ctc_grad_norm_type: instance
training: training:
n_epoch: 20 n_epoch: 20
......
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.tiny
dev_manifest: data/manifest.tiny
test_manifest: data/manifest.tiny
min_input_len: 0.0
max_input_len: 30.0
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
window_ms: 20.0
n_fft: None
max_freq: None
target_sample_rate: 16000
use_dB_normalization: True
target_dB: -20
dither: 1.0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
batch_size: 4
model:
num_conv_layers: 2
num_rnn_layers: 4
rnn_layer_size: 2048
rnn_direction: forward
num_fc_layers: 2
fc_layers_size_list: 512, 256
use_gru: True
blank_id: 0
ctc_grad_norm_type: instance
training:
n_epoch: 10
accum_grad: 1
lr: 1e-5
lr_decay: 1.0
weight_decay: 1e-06
global_grad_clip: 5.0
log_interval: 1
checkpoint:
kbest_n: 3
latest_n: 2
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8
#! /usr/bin/env bash #!/bin/bash
. ${MAIN_ROOT}/utils/utility.sh . ${MAIN_ROOT}/utils/utility.sh
...@@ -9,6 +9,11 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm ...@@ -9,6 +9,11 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5" MD5="099a601759d467cd0a8523ff939819c5"
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
if [ -e $TARGET ];then
echo "$TARGET exists."
exit 0
fi
echo "Download language model ..." echo "Download language model ..."
download $URL $MD5 $TARGET download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path" echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
exit -1 exit -1
fi fi
...@@ -11,19 +11,14 @@ echo "using $ngpu gpus..." ...@@ -11,19 +11,14 @@ echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_path_prefix=$2 ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
model_type=$4
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} --export_path ${jit_model_export_path} \
--model_type ${model_type}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in export!" echo "Failed in export!"
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
model_type=$3
# download language model # download language model
bash local/download_lm_en.sh bash local/download_lm_en.sh
...@@ -22,11 +19,11 @@ if [ $? -ne 0 ]; then ...@@ -22,11 +19,11 @@ if [ $? -ne 0 ]; then
fi fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--device ${device} \ --nproc ${ngpu} \
--nproc 1 \
--config ${config_path} \ --config ${config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then profiler_options=
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1 # seed may break model convergence
fi seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 if [ ${seed} != 0 ]; then
ckpt_name=$2 export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
device=gpu if [ $# != 3 ];then
if [ ngpu == 0 ];then echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
device=cpu exit -1
fi fi
config_path=$1
ckpt_name=$2
model_type=$3
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name} \
--model_type ${model_type} \
--profiler-options "${profiler_options}" \
--seed ${seed}
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in training!" echo "Failed in training!"
......
export MAIN_ROOT=${PWD}/../../../ export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C export LC_ALL=C
......
...@@ -7,11 +7,12 @@ stage=0 ...@@ -7,11 +7,12 @@ stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
avg_num=1 avg_num=1
model_type=offline
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num} avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ###ckpt = deepspeech2
echo "checkpoint name ${ckpt}" echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...@@ -21,20 +22,20 @@ fi ...@@ -21,20 +22,20 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n # export ckpt avg_n
CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
fi fi
...@@ -65,6 +65,8 @@ model: ...@@ -65,6 +65,8 @@ model:
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
......
#!/bin/bash
# Runs ${BIN_DIR}/alignment.py for one checkpoint.
#   $1  config_path       file passed to --config
#   $2  ckpt_path_prefix  checkpoint prefix; also used as the output directory
# BIN_DIR is not set here and must come from the caller's environment.

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

# Number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 when empty).
ngpu=$(echo "$CUDA_VISIBLE_DEVICES" | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2

batch_size=1
output_dir=${ckpt_prefix}
# Expansions below are quoted so paths containing spaces are not word-split.
mkdir -p "${output_dir}"

# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
# NOTE(review): ${type} is never assigned in this script; unless the caller
# exports it, the result file is "${output_dir}/.align" -- confirm intent.
python3 -u "${BIN_DIR}/alignment.py" \
    --nproc "${ngpu}" \
    --config "${config_path}" \
    --result_file "${output_dir}/${type}.align" \
    --checkpoint_path "${ckpt_prefix}" \
    --opts decoding.batch_size "${batch_size}"

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
fi

exit 0
#! /usr/bin/env bash #!/bin/bash
stage=-1 stage=-1
stop_stage=100 stop_stage=100
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 3 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path" echo "usage: $0 config_path ckpt_prefix jit_model_path"
...@@ -12,13 +12,7 @@ config_path=$1 ...@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2 ckpt_path_prefix=$2
jit_model_export_path=$3 jit_model_export_path=$3
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \ python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \ --checkpoint_path ${ckpt_path_prefix} \
......
#! /usr/bin/env bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path ckpt_path_prefix"
...@@ -8,30 +8,57 @@ fi ...@@ -8,30 +8,57 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
config_path=$1 config_path=$1
ckpt_prefix=$2 ckpt_prefix=$2
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model # download language model
#bash local/download_lm_en.sh #bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then #if [ $? -ne 0 ]; then
# exit 1 # exit 1
#fi #fi
python3 -u ${BIN_DIR}/test.py \ for type in attention ctc_greedy_search; do
--device ${device} \ echo "decoding ${type}"
--nproc 1 \ if [ ${chunk_mode} == true ];then
--config ${config_path} \ # stream decoding only support batchsize=1
--result_file ${ckpt_prefix}.rsl \ batch_size=1
--checkpoint_path ${ckpt_prefix} else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
exit 1 exit 1
fi fi
done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0 exit 0
#! /usr/bin/env bash #!/bin/bash
profiler_options=
benchmark_batch_size=0
benchmark_max_step=0
# seed may break model convergence
seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1 exit -1
fi fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
device=gpu
if [ ngpu == 0 ];then
device=cpu
fi
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--device ${device} \ --seed ${seed} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} --output exp/${ckpt_name} \
--profiler-options "${profiler_options}" \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in training!" echo "Failed in training!"
......
...@@ -20,20 +20,26 @@ fi ...@@ -20,20 +20,26 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} ./local/train.sh ${conf_path} ${ckpt}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model # avg n best model
./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n # export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi fi
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册