Commit 23bc2d7b authored by GuoShen

Initial commit

# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
</profile>
</component>
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (pytorchEnv)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/test_asr.iml" filepath="$PROJECT_DIR$/.idea/test_asr.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6 (pytorchEnv)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
# Developer: Guo Shen
# Date: 2022/10/7 16:10
from config.conf import num_workers, pickle_file, IGNORE_ID, input_dim
# Developer: Guo Shen
# Date: 2022/10/7 16:11
import os
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # sets device for model and PyTorch tensors
# Model parameters
input_dim = 80 # dimension of feature
window_size = 25 # window size for FFT (ms)
stride = 10 # window stride for FFT (ms)
hidden_size = 512
embedding_dim = 512
cmvn = True # apply CMVN on feature
num_layers = 4
LFR_m = 4
LFR_n = 3
# Training parameters
n_jobs = 8
decode_beam_size = 20
tf_rate = 1.0
lr = 1e-3
num_workers = 1 # for data-loading; right now, only 1 works with h5py
grad_clip = 5. # clip gradients at an absolute value of
print_freq = 100 # print training/validation stats every __ batches
checkpoint = None # path to checkpoint, None if none
# Data parameters
IGNORE_ID = -1
sos_id = 0
eos_id = 1
num_train = 120098
num_dev = 14326
num_test = 7176
vocab_size = 4336
DATA_DIR = 'data'
aishell_folder = r'E:\asrdataset\model\Listen-Attend-Spell\Listen-Attend-Spell\data\data_aishell'
wav_folder = os.path.join(aishell_folder, 'wav')
tran_file = os.path.join(aishell_folder, r'transcript\aishell_transcript_v0.8.txt')
pickle_file = r'E:\asrdataset\model\Listen-Attend-Spell\Listen-Attend-Spell\data\aishell.pickle'
# Developer: Guo Shen
# Date: 2022/10/7 16:03
# Developer: Guo Shen
# Date: 2022/10/7 16:03
import pickle
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from .data_process import build_LFR_features
from config.conf import num_workers, pickle_file, IGNORE_ID, input_dim
from features.feature_generate import extract_feature
def pad_collate(batch):
max_input_len = float('-inf')
max_target_len = float('-inf')
for elem in batch:
feature, trn = elem
max_input_len = max_input_len if max_input_len > feature.shape[0] else feature.shape[0]
max_target_len = max_target_len if max_target_len > len(trn) else len(trn)
for i, elem in enumerate(batch):
feature, trn = elem
input_length = feature.shape[0]
input_dim = feature.shape[1]
padded_input = np.zeros((max_input_len, input_dim), dtype=np.float32)
padded_input[:input_length, :] = feature
padded_target = np.pad(trn, (0, max_target_len - len(trn)), 'constant', constant_values=IGNORE_ID)
batch[i] = (padded_input, padded_target, input_length)
# sort it by input lengths (long to short)
batch.sort(key=lambda x: x[2], reverse=True)
return default_collate(batch)
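# Illustrative sketch (not part of the original code): toy shapes chosen arbitrarily
# to show what pad_collate returns for two variable-length (frames x dim) samples.
if __name__ == '__main__':
    _toy_batch = [(np.random.randn(5, 3).astype(np.float32), [2, 3, 4]),
                  (np.random.randn(8, 3).astype(np.float32), [5, 6])]
    _inputs, _targets, _lengths = pad_collate(_toy_batch)
    print(_inputs.shape)   # torch.Size([2, 8, 3]) -- inputs zero-padded to the longest one
    print(_targets)        # targets padded with IGNORE_ID (-1)
    print(_lengths)        # tensor([8, 5]) -- sorted from longest to shortest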
# Source: https://www.kaggle.com/davids1992/specaugment-quick-implementation
def spec_augment(spec: np.ndarray,
num_mask=2,
freq_masking=0.15,
time_masking=0.20,
value=0):
spec = spec.copy()
num_mask = random.randint(1, num_mask)
for i in range(num_mask):
all_freqs_num, all_frames_num = spec.shape
freq_percentage = random.uniform(0.0, freq_masking)
num_freqs_to_mask = int(freq_percentage * all_freqs_num)
f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
f0 = int(f0)
spec[f0:f0 + num_freqs_to_mask, :] = value
time_percentage = random.uniform(0.0, time_masking)
num_frames_to_mask = int(time_percentage * all_frames_num)
t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
t0 = int(t0)
spec[:, t0:t0 + num_frames_to_mask] = value
return spec
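# Illustrative sketch (assumed 80 x 100 spectrogram) showing that spec_augment keeps the
# shape while zeroing out a random frequency band and a random time span.
if __name__ == '__main__':
    _spec = np.random.rand(80, 100).astype(np.float32)
    _masked = spec_augment(_spec)
    print(_masked.shape)               # (80, 100) -- shape unchanged
    print(int((_masked == 0).sum()))   # number of masked bins (may be 0 if the random mask widths are 0)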
class AiShellDataset(Dataset):
def __init__(self, args, split):
with open(pickle_file, 'rb') as file:
data = pickle.load(file)
self.samples = data[split]
self.args = args
print('loading {} {} samples...'.format(len(self.samples), split))
def __getitem__(self, i):
sample = self.samples[i]
wave = sample['wave']
trn = sample['trn']
feature = extract_feature(input_file=wave, feature='fbank', dim=input_dim, cmvn=True)
feature = build_LFR_features(feature, m=self.args.LFR_m, n=self.args.LFR_n)
# zero mean and unit variance
feature = (feature - feature.mean()) / feature.std()
feature = spec_augment(feature)
return feature, trn
def __len__(self):
return len(self.samples)
import os
import pickle
import numpy as np
from tqdm import tqdm
from config.conf import wav_folder, tran_file, pickle_file
VOCAB = {'<sos>': 0, '<eos>': 1}
IVOCAB = {0: '<sos>', 1: '<eos>'}
def build_LFR_features(inputs, m, n):
"""
    Actually, this implements frame stacking and frame skipping.
    If m = 1 and n = 1, just return the original features.
    If m = 1 and n > 1, it works like skipping.
    If m > 1 and n = 1, it works like stacking, but only stacks the right (future) frames.
    If m > 1 and n > 1, it works like LFR (low frame rate).
    Args:
        inputs: T x D np.ndarray
        m: number of frames to stack
        n: number of frames to skip
"""
# LFR_inputs_batch = []
# for inputs in inputs_batch:
LFR_inputs = []
T = inputs.shape[0]
T_lfr = int(np.ceil(T / n))
for i in range(T_lfr):
if m <= T - i * n:
LFR_inputs.append(np.hstack(inputs[i * n:i * n + m]))
else: # process last LFR frame
num_padding = m - (T - i * n)
frame = np.hstack(inputs[i * n:])
for _ in range(num_padding):
frame = np.hstack((frame, inputs[-1]))
LFR_inputs.append(frame)
return np.vstack(LFR_inputs)
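# Illustrative sketch (toy 7 x 2 input, m=4, n=3) of the low-frame-rate transform:
# ceil(7 / 3) = 3 output frames, each a stack of 4 consecutive 2-dim frames.
if __name__ == '__main__':
    _demo = np.arange(14, dtype=np.float32).reshape(7, 2)
    _lfr = build_LFR_features(_demo, m=4, n=3)
    print(_lfr.shape)   # (3, 8)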
def ensure_folder(folder):
    if not os.path.isdir(folder):
        os.mkdir(folder)
def get_data(split):
print('getting {} data...'.format(split))
global VOCAB
with open(tran_file, 'r', encoding='utf-8') as file:
lines = file.readlines()
tran_dict = dict()
for line in lines:
tokens = line.split()
key = tokens[0]
trn = ''.join(tokens[1:])
tran_dict[key] = trn
samples = []
folder = os.path.join(wav_folder, split)
ensure_folder(folder)
dirs = [os.path.join(folder, d) for d in os.listdir(folder) if os.path.isdir(os.path.join(folder, d))]
for dir in tqdm(dirs):
files = [f for f in os.listdir(dir) if f.endswith('.wav')]
for f in files:
wave = os.path.join(dir, f)
key = f.split('.')[0]
if key in tran_dict:
trn = tran_dict[key]
trn = list(trn.strip()) + ['<eos>']
for token in trn:
build_vocab(token)
trn = [VOCAB[token] for token in trn]
samples.append({'trn': trn, 'wave': wave})
print('split: {}, num_files: {}'.format(split, len(samples)))
return samples
def build_vocab(token):
global VOCAB, IVOCAB
    if token not in VOCAB:
next_index = len(VOCAB)
VOCAB[token] = next_index
IVOCAB[next_index] = token
def data_pre():
data = dict()
data['VOCAB'] = VOCAB
data['IVOCAB'] = IVOCAB
data['train'] = get_data('train')
data['dev'] = get_data('dev')
data['test'] = get_data('test')
with open(pickle_file, 'wb') as file:
print(data)
pickle.dump(data, file)
print('num_train: ' + str(len(data['train'])))
print('num_dev: ' + str(len(data['dev'])))
print('num_test: ' + str(len(data['test'])))
print('vocab_size: ' + str(len(data['VOCAB'])))
if __name__ == "__main__":
data_pre()
# Developer: Guo Shen
# Date: 2022/10/8 19:16
# Developer: Guo Shen
# Date: 2022/10/8 19:17
import librosa
import numpy as np
def normalize(yt):
yt_max = np.max(yt)
yt_min = np.min(yt)
a = 1.0 / (yt_max - yt_min)
b = -(yt_max + yt_min) / (2 * (yt_max - yt_min))
yt = yt * a + b
return yt
def extract_feature(input_file, feature='fbank', dim=80, cmvn=True, delta=False, delta_delta=False,
window_size=25, stride=10, save_feature=None):
y, sr = librosa.load(input_file, sr=None)
yt, _ = librosa.effects.trim(y, top_db=20)
yt = normalize(yt)
ws = int(sr * 0.001 * window_size)
st = int(sr * 0.001 * stride)
if feature == 'fbank': # log-scaled
feat = librosa.feature.melspectrogram(y=yt, sr=sr, n_mels=dim,
n_fft=ws, hop_length=st)
feat = np.log(feat + 1e-6)
elif feature == 'mfcc':
feat = librosa.feature.mfcc(y=yt, sr=sr, n_mfcc=dim, n_mels=26,
n_fft=ws, hop_length=st)
        feat[0] = librosa.feature.rms(y=yt, hop_length=st, frame_length=ws)  # librosa >= 0.8 renamed rmse to rms
else:
raise ValueError('Unsupported Acoustic Feature: ' + feature)
feat = [feat]
if delta:
feat.append(librosa.feature.delta(feat[0]))
if delta_delta:
feat.append(librosa.feature.delta(feat[0], order=2))
feat = np.concatenate(feat, axis=0)
if cmvn:
feat = (feat - feat.mean(axis=1)[:, np.newaxis]) / (feat.std(axis=1) + 1e-16)[:, np.newaxis]
if save_feature is not None:
tmp = np.swapaxes(feat, 0, 1).astype('float32')
np.save(save_feature, tmp)
return len(tmp)
else:
return np.swapaxes(feat, 0, 1).astype('float32')
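# Illustrative sketch (the wav path below is a placeholder, not a file shipped with the project):
# extracting 80-dim log-mel fbank features yields a (num_frames, 80) float32 matrix.
if __name__ == '__main__':
    _fbank = extract_feature(input_file='example.wav', feature='fbank', dim=80, cmvn=True)
    print(_fbank.shape)   # (num_frames, 80); num_frames depends on the wav duration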
# Developer: Guo Shen
# Date: 2022/10/8 19:45
import torch
import torch.nn as nn
import torch.nn.functional as F
class DotProductAttention(nn.Module):
r"""Dot product attention.
Given a set of vector values, and a vector query, attention is a technique
to compute a weighted sum of the values, dependent on the query.
NOTE: Here we use the terminology in Stanford cs224n-2018-lecture11.
"""
def __init__(self):
super(DotProductAttention, self).__init__()
# TODO: move this out of this class?
# self.linear_out = nn.Linear(dim*2, dim)
def forward(self, queries, values):
"""
Args:
queries: N x To x H
values : N x Ti x H
Returns:
output: N x To x H
attention_distribution: N x To x Ti
"""
batch_size = queries.size(0)
hidden_size = queries.size(2)
input_lengths = values.size(1)
# (N, To, H) * (N, H, Ti) -> (N, To, Ti)
attention_scores = torch.bmm(queries, values.transpose(1, 2))
attention_distribution = F.softmax(
attention_scores.view(-1, input_lengths), dim=1).view(batch_size, -1, input_lengths)
# (N, To, Ti) * (N, Ti, H) -> (N, To, H)
attention_output = torch.bmm(attention_distribution, values)
# # concat -> (N, To, 2*H)
# concated = torch.cat((attention_output, queries), dim=2)
# # TODO: Move this out of this class?
# # output -> (N, To, H)
# output = torch.tanh(self.linear_out(
# concated.view(-1, 2*hidden_size))).view(batch_size, -1, hidden_size)
return attention_output, attention_distribution
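# Illustrative shape check (N=2, To=5, Ti=7, H=4 chosen arbitrarily) for the attention module.
if __name__ == '__main__':
    _attn = DotProductAttention()
    _q = torch.randn(2, 5, 4)   # queries: N x To x H
    _v = torch.randn(2, 7, 4)   # values:  N x Ti x H
    _out, _dist = _attn(_q, _v)
    print(_out.shape, _dist.shape)   # torch.Size([2, 5, 4]) torch.Size([2, 5, 7])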
import torch
import torch.nn as nn
import torch.nn.functional as F
from config.conf import IGNORE_ID, vocab_size, sos_id, eos_id
from utils.util import pad_list
from .attention import DotProductAttention
class Decoder(nn.Module):
"""
"""
def __init__(self, vocab_size=vocab_size, embedding_dim=512, sos_id=sos_id, eos_id=eos_id, hidden_size=512,
num_layers=1, bidirectional_encoder=True):
super(Decoder, self).__init__()
# Hyper parameters
# embedding + output
self.vocab_size = vocab_size
self.embedding_dim = embedding_dim
self.sos_id = sos_id # Start of Sentence
self.eos_id = eos_id # End of Sentence
# rnn
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bidirectional_encoder = bidirectional_encoder # useless now
self.encoder_hidden_size = hidden_size # must be equal now
# Components
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)  # embed each token as an embedding_dim-dimensional vector
self.rnn = nn.ModuleList()
self.rnn += [nn.LSTMCell(self.embedding_dim +
self.encoder_hidden_size, self.hidden_size)]
for l in range(1, self.num_layers):
self.rnn += [nn.LSTMCell(self.hidden_size, self.hidden_size)]
        self.attention = DotProductAttention()  # dot-product attention
self.mlp = nn.Sequential(
nn.Linear(self.encoder_hidden_size + self.hidden_size,
self.hidden_size),
nn.Tanh(),
nn.Linear(self.hidden_size, self.vocab_size))
def zero_state(self, encoder_padded_outputs, H=None):
N = encoder_padded_outputs.size(0)
        H = self.hidden_size if H is None else H
return encoder_padded_outputs.new_zeros(N, H)
def forward(self, padded_input, encoder_padded_outputs):
"""
Args:
padded_input: N x To
# encoder_hidden: (num_layers * num_directions) x N x H
encoder_padded_outputs: N x Ti x H
Returns:
"""
# *********Get Input and Output
# from espnet/Decoder.forward()
# TODO: need to make more smart way
ys = [y[y != IGNORE_ID] for y in padded_input] # parse padded ys
# prepare input and output word sequences with sos/eos IDs
eos = ys[0].new([self.eos_id])
sos = ys[0].new([self.sos_id])
ys_in = [torch.cat([sos, y], dim=0) for y in ys]
ys_out = [torch.cat([y, eos], dim=0) for y in ys]
# padding for ys with -1
# pys: utt x olen
ys_in_pad = pad_list(ys_in, self.eos_id)
ys_out_pad = pad_list(ys_out, IGNORE_ID)
# print("ys_in_pad", ys_in_pad.size())
assert ys_in_pad.size() == ys_out_pad.size()
batch_size = ys_in_pad.size(0)
output_length = ys_in_pad.size(1)
# max_length = ys_in_pad.size(1) - 1 # TODO: should minus 1(sos)?
# *********Init decoder rnn
h_list = [self.zero_state(encoder_padded_outputs)]
c_list = [self.zero_state(encoder_padded_outputs)]
for l in range(1, self.num_layers):
h_list.append(self.zero_state(encoder_padded_outputs))
c_list.append(self.zero_state(encoder_padded_outputs))
att_c = self.zero_state(encoder_padded_outputs,
H=encoder_padded_outputs.size(2))
y_all = []
        # **********LAS: 1. decoder rnn 2. attention 3. concat and MLP
embedded = self.embedding(ys_in_pad)
for t in range(output_length):
# step 1. decoder RNN: s_i = RNN(s_i−1,y_i−1,c_i−1)
rnn_input = torch.cat((embedded[:, t, :], att_c), dim=1)
h_list[0], c_list[0] = self.rnn[0](
rnn_input, (h_list[0], c_list[0]))
for l in range(1, self.num_layers):
h_list[l], c_list[l] = self.rnn[l](
h_list[l - 1], (h_list[l], c_list[l]))
rnn_output = h_list[-1] # below unsqueeze: (N x H) -> (N x 1 x H)
# step 2. attention: c_i = AttentionContext(s_i,h)
att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
encoder_padded_outputs)
att_c = att_c.squeeze(dim=1)
            # step 3. concatenate s_i and c_i, and feed the result to the MLP
mlp_input = torch.cat((rnn_output, att_c), dim=1)
predicted_y_t = self.mlp(mlp_input)
y_all.append(predicted_y_t)
y_all = torch.stack(y_all, dim=1) # N x To x C
# **********Cross Entropy Loss
# F.cross_entropy = NLL(log_softmax(input), target))
y_all = y_all.view(batch_size * output_length, self.vocab_size)
ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
ignore_index=IGNORE_ID,
reduction='mean')
# TODO: should minus 1 here ?
# ce_loss *= (np.mean([len(y) for y in ys_in]) - 1)
# print("ys_in\n", ys_in)
# temp = [len(x) for x in ys_in]
# print(temp)
# print(np.mean(temp) - 1)
return ce_loss
# *********step decode
# decoder_outputs = []
# sequence_symbols = []
# lengths = np.array([max_length] * batch_size)
# def decode(step, step_output, step_attn):
# # step_output is log_softmax()
# decoder_outputs.append(step_output)
# symbols = decoder_outputs[-1].topk(1)[1]
# sequence_symbols.append(symbols)
# #
# eos_batches = symbols.data.eq(self.eos_id)
# if eos_batches.dim() > 0:
# eos_batches = eos_batches.cpu().view(-1).numpy()
# update_idx = ((step < lengths) & eos_batches) != 0
# lengths[update_idx] = len(sequence_symbols)
# return symbols
# # *********Run each component
# decoder_input = ys_in_pad
# embedded = self.embedding(decoder_input)
# rnn_output, decoder_hidden = self.rnn(embedded) # use zero state
# output, attn = self.attention(rnn_output, encoder_padded_outputs)
# output = output.contiguous().view(-1, self.hidden_size)
# predicted_softmax = F.log_softmax(self.out(output), dim=1).view(
# batch_size, output_length, -1)
# for t in range(predicted_softmax.size(1)):
# step_output = predicted_softmax[:, t, :]
# step_attn = attn[:, t, :]
# decode(t, step_output, step_attn)
def recognize_beam(self, encoder_outputs, char_list, args):
"""Beam search, decode one utterence now.
Args:
encoder_outputs: T x H
char_list: list of character
args: args.beam
Returns:
nbest_hyps:
"""
# search params
beam = args.beam_size
nbest = args.nbest
if args.decode_max_len == 0:
maxlen = encoder_outputs.size(0)
else:
maxlen = args.decode_max_len
# *********Init decoder rnn
h_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
c_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
for l in range(1, self.num_layers):
h_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
c_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
att_c = self.zero_state(encoder_outputs.unsqueeze(0),
H=encoder_outputs.unsqueeze(0).size(2))
# prepare sos
y = self.sos_id
vy = encoder_outputs.new_zeros(1).long()
hyp = {'score': 0.0, 'yseq': [y], 'c_prev': c_list, 'h_prev': h_list,
'a_prev': att_c}
hyps = [hyp]
ended_hyps = []
for i in range(maxlen):
hyps_best_kept = []
for hyp in hyps:
# vy.unsqueeze(1)
vy[0] = hyp['yseq'][i]
embedded = self.embedding(vy)
# embedded.unsqueeze(0)
# step 1. decoder RNN: s_i = RNN(s_i−1,y_i−1,c_i−1)
rnn_input = torch.cat((embedded, hyp['a_prev']), dim=1)
h_list[0], c_list[0] = self.rnn[0](
rnn_input, (hyp['h_prev'][0], hyp['c_prev'][0]))
for l in range(1, self.num_layers):
h_list[l], c_list[l] = self.rnn[l](
h_list[l - 1], (hyp['h_prev'][l], hyp['c_prev'][l]))
rnn_output = h_list[-1]
# step 2. attention: c_i = AttentionContext(s_i,h)
# below unsqueeze: (N x H) -> (N x 1 x H)
att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
encoder_outputs.unsqueeze(0))
att_c = att_c.squeeze(dim=1)
                # step 3. concatenate s_i and c_i, and feed the result to the MLP
mlp_input = torch.cat((rnn_output, att_c), dim=1)
predicted_y_t = self.mlp(mlp_input)
local_scores = F.log_softmax(predicted_y_t, dim=1)
# topk scores
local_best_scores, local_best_ids = torch.topk(
local_scores, beam, dim=1)
for j in range(beam):
new_hyp = {}
new_hyp['h_prev'] = h_list[:]
new_hyp['c_prev'] = c_list[:]
new_hyp['a_prev'] = att_c[:]
new_hyp['score'] = hyp['score'] + local_best_scores[0, j]
new_hyp['yseq'] = [0] * (1 + len(hyp['yseq']))
new_hyp['yseq'][:len(hyp['yseq'])] = hyp['yseq']
new_hyp['yseq'][len(hyp['yseq'])] = int(
local_best_ids[0, j])
# will be (2 x beam) hyps at most
hyps_best_kept.append(new_hyp)
hyps_best_kept = sorted(hyps_best_kept,
key=lambda x: x['score'],
reverse=True)[:beam]
# end for hyp in hyps
hyps = hyps_best_kept
# add eos in the final loop to avoid that there are no ended hyps
if i == maxlen - 1:
for hyp in hyps:
hyp['yseq'].append(self.eos_id)
            # add ended hypotheses to the final list and remove them from the current hypotheses
            # (this can be a problem: the number of hyps may become smaller than the beam)
remained_hyps = []
for hyp in hyps:
if hyp['yseq'][-1] == self.eos_id:
# hyp['score'] += (i + 1) * penalty
ended_hyps.append(hyp)
else:
remained_hyps.append(hyp)
hyps = remained_hyps
# if len(hyps) > 0:
# print('remeined hypothes: ' + str(len(hyps)))
# else:
# print('no hypothesis. Finish decoding.')
# break
#
# for hyp in hyps:
# print('hypo: ' + ''.join([char_list[int(x)]
# for x in hyp['yseq'][1:]]))
# end for i in range(maxlen)
nbest_hyps = sorted(ended_hyps, key=lambda x: x['score'], reverse=True)[
:min(len(ended_hyps), nbest)]
return nbest_hyps
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class Encoder(nn.Module):
r"""Applies a multi-layer LSTM to an variable length input sequence.
"""
def __init__(self, input_size=320, hidden_size=256, num_layers=3,
dropout=0.0, bidirectional=True, rnn_type='lstm'):
super(Encoder, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bidirectional = bidirectional
self.rnn_type = rnn_type
self.dropout = dropout
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
batch_first=True,
dropout=dropout,
bidirectional=bidirectional)
def forward(self, padded_input, input_lengths):
"""
Args:
padded_input: N x T x D
input_lengths: N
Returns: output, hidden
- **output**: N x T x H
- **hidden**: (num_layers * num_directions) x N x H
"""
        # Add total_length for supporting nn.DataParallel() later
# see https://pytorch.org/docs/stable/notes/faq.html#pack-rnn-unpack-with-data-parallelism
total_length = padded_input.size(1) # get the max sequence length
        # lengths must be a 1-D CPU tensor for pack_padded_sequence in recent PyTorch versions
        packed_input = pack_padded_sequence(padded_input, input_lengths.cpu(), batch_first=True)
packed_output, hidden = self.lstm(packed_input)
output, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=total_length)
return output, hidden
    def flatten_parameters(self):
        self.lstm.flatten_parameters()
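# Illustrative shape check (batch of 2, feature dim 320, lengths 6 and 4 chosen arbitrarily):
# with a bidirectional encoder the output feature size is 2 * hidden_size.
if __name__ == '__main__':
    import torch
    _enc = Encoder(input_size=320, hidden_size=256, num_layers=3, bidirectional=True)
    _out, _hidden = _enc(torch.randn(2, 6, 320), torch.tensor([6, 4]))
    print(_out.shape)   # torch.Size([2, 6, 512])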
class LasOptimizer(object):
"""A simple wrapper class for learning rate scheduling"""
    def __init__(self, optimizer, warmup_steps=4000, k=0.2):
        self.optimizer = optimizer
        self.k = k
        self.warmup_steps = warmup_steps
        d_model = 512
        self.init_lr = d_model ** (-0.5)
        self.lr = self.init_lr
        self.step_num = 0
def zero_grad(self):
self.optimizer.zero_grad()
def step(self):
self._update_lr()
self.optimizer.step()
def _update_lr(self):
self.step_num += 1
self.lr = self.k * self.init_lr * min(self.step_num ** (-0.5),
self.step_num * (self.warmup_steps ** (-1.5)))
for param_group in self.optimizer.param_groups:
param_group['lr'] = self.lr
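# Illustrative sketch (dummy linear layer, default warmup_steps=4000) of the schedule:
# the learning rate grows during warmup and then decays as step ** -0.5.
if __name__ == '__main__':
    import torch
    _opt = LasOptimizer(torch.optim.Adam(torch.nn.Linear(4, 4).parameters(), lr=0.0))
    for _target in (1, 1000, 4000, 16000):
        while _opt.step_num < _target:
            _opt.step()
        print(_target, _opt.lr)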
import torch.nn as nn
from .decoder import Decoder
from .encoder import Encoder
class Seq2Seq(nn.Module):
"""Sequence-to-Sequence architecture with configurable encoder and decoder.
"""
def __init__(self, encoder=None, decoder=None):
super(Seq2Seq, self).__init__()
if encoder is not None and decoder is not None:
self.encoder = encoder
self.decoder = decoder
else:
self.encoder = Encoder()
self.decoder = Decoder()
def forward(self, padded_input, input_lengths, padded_target):
"""
Args:
padded_input: N x Ti x D
input_lengths: N
padded_targets: N x To
"""
encoder_padded_outputs, _ = self.encoder(padded_input, input_lengths)
loss = self.decoder(padded_target, encoder_padded_outputs)
return loss
def recognize(self, input, input_length, char_list, args):
"""Sequence-to-Sequence beam search, decode one utterence now.
Args:
input: T x D
char_list: list of characters
args: args.beam
Returns:
nbest_hyps:
"""
encoder_outputs, _ = self.encoder(input.unsqueeze(0), input_length)
nbest_hyps = self.decoder.recognize_beam(encoder_outputs[0],
char_list,
args)
return nbest_hyps
import argparse
import pickle
import torch
from tqdm import tqdm
from config.conf import pickle_file, device, input_dim, LFR_m, LFR_n, sos_id, eos_id
from data.data_process import build_LFR_features
from features.feature_generate import extract_feature
from utils.util import cer_function
parser = argparse.ArgumentParser(
"End-to-End Automatic Speech Recognition Decoding.")
# decode
parser.add_argument('--beam_size', default=5, type=int,
help='Beam size')
parser.add_argument('--nbest', default=1, type=int,
help='Nbest size')
parser.add_argument('--decode_max_len', default=100, type=int,
                    help='Max output length. If set to 0, the encoder output '
                         'length is used as the maximum hypothesis length')
if __name__ == '__main__':
args = parser.parse_args()
with open(pickle_file, 'rb') as file:
data = pickle.load(file)
char_list = data['IVOCAB']
samples = data['test']
checkpoint = 'BEST_checkpoint.tar'
checkpoint = torch.load(checkpoint, map_location='cpu')
model = checkpoint['model'].to(device)
model.eval()
num_samples = len(samples)
total_cer = 0
for i in tqdm(range(num_samples)):
sample = samples[i]
wave = sample['wave']
trn = sample['trn']
feature = extract_feature(input_file=wave, feature='fbank', dim=input_dim, cmvn=True)
feature = build_LFR_features(feature, m=LFR_m, n=LFR_n)
# feature = np.expand_dims(feature, axis=0)
input = torch.from_numpy(feature).to(device)
input_length = [input.shape[0]]
input_length = torch.LongTensor(input_length).to(device)
with torch.no_grad():
nbest_hyps = model.recognize(input, input_length, char_list, args)
hyp_list = []
for hyp in nbest_hyps:
out = hyp['yseq']
out = [char_list[idx] for idx in out if idx not in (sos_id, eos_id)]
out = ''.join(out)
hyp_list.append(out)
print(hyp_list)
gt = [char_list[idx] for idx in trn if idx not in (sos_id, eos_id)]
gt = ''.join(gt)
gt_list = [gt]
print(gt_list)
cer = cer_function(gt_list, hyp_list)
total_cer += cer
avg_cer = total_cer / num_samples
print('Average CER: ' + str(avg_cer))
# Developer: Guo Shen
# Date: 2022/10/7 16:12
import numpy as np
import torch
import argparse
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from models.LAS.decoder import Decoder
from models.LAS.encoder import Encoder
from models.LAS.optimizer import LasOptimizer
from models.LAS.seq2seq import Seq2Seq
from data.data_load import AiShellDataset, pad_collate
from config.conf import device, print_freq, vocab_size, num_workers, sos_id, eos_id
from utils.util import get_logger, save_checkpoint, AverageMeter
parser = argparse.ArgumentParser(
    "End-to-End Automatic Speech Recognition Training "
    "(Listen, Attend and Spell).")
# Low Frame Rate (stacking and skipping frames)
parser.add_argument('--LFR_m', default=4, type=int,
help='Low Frame Rate: number of frames to stack')
parser.add_argument('--LFR_n', default=3, type=int,
help='Low Frame Rate: number of frames to skip')
# general
# Network architecture
# encoder
# TODO: automatically infer input dim
parser.add_argument('--einput', default=80, type=int,
help='Dim of encoder input')
parser.add_argument('--ehidden', default=256, type=int,
help='Size of encoder hidden units')
parser.add_argument('--elayer', default=3, type=int,
help='Number of encoder layers.')
parser.add_argument('--edropout', default=0.2, type=float,
help='Encoder dropout rate')
parser.add_argument('--ebidirectional', default=True, type=bool,
help='Whether use bidirectional encoder')
parser.add_argument('--etype', default='lstm', type=str,
help='Type of encoder RNN')
# attention
parser.add_argument('--atype', default='dot', type=str,
help='Type of attention (Only support Dot Product now)')
# decoder
parser.add_argument('--dembed', default=512, type=int,
help='Size of decoder embedding')
parser.add_argument('--dhidden', default=512, type=int,
                    help='Size of decoder hidden units. Should match the encoder '
                         'hidden size (x2 if the encoder is bidirectional)')
parser.add_argument('--dlayer', default=1, type=int,
help='Number of decoder layers.')
# Training config
parser.add_argument('--epochs', default=150, type=int,
help='Number of maximum epochs')
parser.add_argument('--half_lr', dest='half_lr', default=True, type=bool,
                    help='Halve the learning rate when the improvement is small')
parser.add_argument('--early_stop', dest='early_stop', default=0, type=int,
                    help='Stop training early when halving the learning rate '
                         'still yields only a small improvement')
parser.add_argument('--max_norm', default=5, type=float,
help='Gradient norm threshold to clip')
# minibatch
parser.add_argument('--batch-size', '-b', default=32, type=int,
help='Batch size')
parser.add_argument('--maxlen_in', default=800, type=int, metavar='ML',
help='Batch size is reduced if the input sequence length > ML')
parser.add_argument('--maxlen_out', default=150, type=int, metavar='ML',
help='Batch size is reduced if the output sequence length > ML')
parser.add_argument('--num_workers', default=4, type=int,
help='Number of workers to generate minibatch')
# optimizer
parser.add_argument('--optimizer', default='adam', type=str,
choices=['sgd', 'adam'],
help='Optimizer (support sgd and adam now)')
parser.add_argument('--lr', default=1e-2, type=float,
help='Init learning rate')
parser.add_argument('--momentum', default=0.0, type=float,
help='Momentum for optimizer')
parser.add_argument('--l2', default=1e-5, type=float,
help='weight decay (L2 penalty)')
parser.add_argument('--checkpoint', type=str, default=None, help='checkpoint')
def train_net(args):
torch.manual_seed(7)
np.random.seed(7)
checkpoint = args.checkpoint
start_epoch = 0
best_loss = float('inf')
writer = SummaryWriter()
epochs_since_improvement = 0
# Initialize / load checkpoint
if checkpoint is None:
# model
encoder = Encoder(args.einput * args.LFR_m, args.ehidden, args.elayer,
dropout=args.edropout, bidirectional=args.ebidirectional,
rnn_type=args.etype)
decoder = Decoder(vocab_size, args.dembed, sos_id,
eos_id, args.dhidden, args.dlayer,
bidirectional_encoder=args.ebidirectional)
model = Seq2Seq(encoder, decoder)
print(model)
model.to(device)
optimizer = LasOptimizer(
torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98),
                             eps=1e-09))  # betas: exponential decay rates for the first- and second-moment estimates; eps: guards against division by zero
else:
checkpoint = torch.load(checkpoint)
start_epoch = checkpoint['epoch'] + 1
epochs_since_improvement = checkpoint['epochs_since_improvement']
model = checkpoint['model']
optimizer = checkpoint['optimizer']
logger = get_logger()
# Custom dataloaders
train_dataset = AiShellDataset(args, 'train')
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, collate_fn=pad_collate,
pin_memory=True, shuffle=True, num_workers=num_workers)
valid_dataset = AiShellDataset(args, 'dev')
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size, collate_fn=pad_collate,
pin_memory=True, shuffle=False,
                                               num_workers=num_workers)  # pin_memory=True: allocate tensors in page-locked memory for faster host-to-GPU copies; collate_fn=pad_collate: assembles a list of samples into one padded mini-batch
# Epochs
for epoch in range(start_epoch, args.epochs):
# One epoch's training
train_loss = train(train_loader=train_loader,
model=model,
optimizer=optimizer,
epoch=epoch,
logger=logger)
writer.add_scalar('model/train_loss', train_loss, epoch)
lr = optimizer.lr
print('\nLearning rate: {}'.format(lr))
step_num = optimizer.step_num
print('Step num: {}\n'.format(step_num))
writer.add_scalar('model/learning_rate', lr, epoch)
# One epoch's validation
valid_loss = valid(valid_loader=valid_loader,
model=model,
logger=logger)
writer.add_scalar('model/valid_loss', valid_loss, epoch)
# Check if there was an improvement
is_best = valid_loss < best_loss
best_loss = min(valid_loss, best_loss)
if not is_best:
epochs_since_improvement += 1
print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
else:
epochs_since_improvement = 0
# Save checkpoint
save_checkpoint(epoch, epochs_since_improvement, model, optimizer, best_loss, is_best)
def train(train_loader, model, optimizer, epoch, logger):
model.train() # train mode (dropout and batchnorm is used)
losses = AverageMeter()
# Batches
for i, (data) in enumerate(train_loader):
# Move to GPU, if available
padded_input, padded_target, input_lengths = data
padded_input = padded_input.to(device)
padded_target = padded_target.to(device)
input_lengths = input_lengths.to(device)
# Forward prop.
loss = model(padded_input, input_lengths, padded_target.long())
# Back prop.
optimizer.zero_grad()
loss.backward()
# Update weights
optimizer.step()
# Keep track of metrics
losses.update(loss.item())
# Print status
if i % print_freq == 0:
logger.info('Epoch: [{0}][{1}/{2}]\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(epoch, i, len(train_loader), loss=losses))
return losses.avg
def valid(valid_loader, model, logger):
model.eval()
losses = AverageMeter()
# Batches
for data in tqdm(valid_loader):
# Move to GPU, if available
padded_input, padded_target, input_lengths = data
padded_input = padded_input.to(device)
padded_target = padded_target.to(device)
input_lengths = input_lengths.to(device)
# Forward prop.
loss = model(padded_input, input_lengths, padded_target.long())
# Keep track of metrics
losses.update(loss.item())
# Print status
logger.info('\nValidation Loss {loss.val:.4f} ({loss.avg:.4f})\n'.format(loss=losses))
return losses.avg
def main():
global args
args = parser.parse_args()
train_net(args)
if __name__ == '__main__':
main()
# Developer: Guo Shen
# Date: 2022/10/7 16:00
from utils.util import pad_list
# Developer: Guo Shen
# Date: 2022/10/7 16:00
import logging
import torch
def get_logger():
logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s %(levelname)s \t%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
def save_checkpoint(epoch, epochs_since_improvement, model, optimizer, loss, is_best):
state = {'epoch': epoch,
'epochs_since_improvement': epochs_since_improvement,
'loss': loss,
'model': model,
'optimizer': optimizer}
filename = 'checkpoint.tar'
torch.save(state, filename)
# If this checkpoint is the best so far, store a copy so it doesn't get overwritten by a worse checkpoint
if is_best:
torch.save(state, 'BEST_checkpoint.tar')
class AverageMeter(object):
"""
Keeps track of most recent, average, sum, and count of a metric.
"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def pad_list(xs, pad_value):
# From: espnet/src/nets/e2e_asr_th.py: pad_list()
n_batch = len(xs)
max_len = max(x.size(0) for x in xs)
pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
for i in range(n_batch):
pad[i, :xs[i].size(0)] = xs[i]
return pad
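# Illustrative sketch of pad_list (toy label sequences of length 3 and 1, pad value -1):
if __name__ == '__main__':
    _ys = [torch.tensor([5, 6, 7]), torch.tensor([8])]
    print(pad_list(_ys, -1))   # tensor([[ 5,  6,  7], [ 8, -1, -1]])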
def levenshtein(u, v):
prev = None
curr = [0] + list(range(1, len(v) + 1))
# Operations: (SUB, DEL, INS)
prev_ops = None
curr_ops = [(0, 0, i) for i in range(len(v) + 1)]
for x in range(1, len(u) + 1):
prev, curr = curr, [x] + ([None] * len(v))
prev_ops, curr_ops = curr_ops, [(0, x, 0)] + ([None] * len(v))
for y in range(1, len(v) + 1):
delcost = prev[y] + 1
addcost = curr[y - 1] + 1
subcost = prev[y - 1] + int(u[x - 1] != v[y - 1])
curr[y] = min(subcost, delcost, addcost)
if curr[y] == subcost:
(n_s, n_d, n_i) = prev_ops[y - 1]
curr_ops[y] = (n_s + int(u[x - 1] != v[y - 1]), n_d, n_i)
elif curr[y] == delcost:
(n_s, n_d, n_i) = prev_ops[y]
curr_ops[y] = (n_s, n_d + 1, n_i)
else:
(n_s, n_d, n_i) = curr_ops[y - 1]
curr_ops[y] = (n_s, n_d, n_i + 1)
return curr[len(v)], curr_ops[len(v)]
def load_file(fname, encoding):
    try:
        with open(fname, 'r', encoding=encoding) as f:
            data = [line.rstrip('\n').rstrip('\r') for line in f]
    except OSError:
        logging.error('Error reading file "%s"', fname)
        exit(1)
    return data
def cer_function(ref, hyp):
wer_s, wer_i, wer_d, wer_n = 0, 0, 0, 0
cer_s, cer_i, cer_d, cer_n = 0, 0, 0, 0
sen_err = 0
for n in range(len(ref)):
# update CER statistics
_, (s, i, d) = levenshtein(ref[n], hyp[n])
cer_s += s
cer_i += i
cer_d += d
cer_n += len(ref[n])
# update WER statistics
_, (s, i, d) = levenshtein(ref[n].split(), hyp[n].split())
wer_s += s
wer_i += i
wer_d += d
wer_n += len(ref[n].split())
# update SER statistics
if s + i + d > 0:
sen_err += 1
print(cer_s, cer_i, cer_d, cer_n)
return (cer_s + cer_i + cer_d) / cer_n
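# Illustrative sketch of the character error rate on a made-up reference/hypothesis pair:
# one substituted character out of six reference characters gives a CER of 1/6.
if __name__ == '__main__':
    print(cer_function(['今天天气不错'], ['今天天汽不错']))   # prints the edit counts, then ~0.1667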