Unverified commit 6750770e authored by: H Hui Zhang, committed by: GitHub

Merge pull request #1012 from zh794390558/datapipe

[asr] independent dataloader
# ASR
* s0 for deepspeech2
* s1 for u2/transformer/conformer
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data
......
......@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
......@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
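The statistics step above (`compute_mean_std.py`) writes per-dimension feature mean/std to `data/mean_std.json` for CMVN. A rough sketch of what that amounts to — the field names here are assumptions, not the utility's actual schema:

```python
import json

import numpy as np


def compute_mean_std(feature_list, output_path):
    """feature_list: iterable of [num_frames, feat_dim] arrays."""
    frames = np.concatenate(list(feature_list), axis=0)  # stack all frames
    stats = {
        "mean": frames.mean(axis=0).tolist(),  # per-dimension mean
        "std": frames.std(axis=0).tolist(),    # per-dimension stddev
    }
    with open(output_path, "w") as fout:
        json.dump(stats, fout)


# toy usage: two utterances of 80-dim fbank-like features
compute_mean_std([np.random.randn(100, 80), np.random.randn(120, 80)],
                 "mean_std.json")
```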
......@@ -19,3 +19,13 @@ You need to set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
\ No newline at end of file
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
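The `time_warp`/`freq_mask`/`time_mask` processors above implement SpecAugment. A minimal numpy sketch of the two masking steps — an illustration of the `F`, `T`, and `n_mask` parameters, not the library's code:

```python
import numpy as np


def freq_mask(spec, F=30, n_mask=2, replace_with_zero=False):
    """spec: [frames, n_mels]; mask up to F consecutive mel bins, n_mask times."""
    spec = spec.copy()
    fill = 0.0 if replace_with_zero else spec.mean()
    for _ in range(n_mask):
        f = np.random.randint(0, F + 1)
        f0 = np.random.randint(0, max(1, spec.shape[1] - f))
        spec[:, f0:f0 + f] = fill
    return spec


def time_mask(spec, T=40, n_mask=2, replace_with_zero=False):
    """Mask up to T consecutive frames, n_mask times."""
    spec = spec.copy()
    fill = 0.0 if replace_with_zero else spec.mean()
    for _ in range(n_mask):
        t = np.random.randint(0, T + 1)
        t0 = np.random.randint(0, max(1, spec.shape[0] - t))
        spec[t0:t0 + t, :] = fill
    return spec


aug = time_mask(freq_mask(np.random.randn(300, 80)))
```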
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5
max_input_len: 20.0 # second
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1e-6
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
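Two different `ctc_weight`s appear above: `model_conf.ctc_weight: 0.3` mixes the training losses of the hybrid CTC/attention model, while `decoding.ctc_weight: 0.5` re-weights CTC scores during attention rescoring. The training-side combination reduces to:

```python
def hybrid_loss(loss_ctc, loss_att, ctc_weight=0.3):
    # hybrid CTC/attention objective, as set by model_conf.ctc_weight above
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att
```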
......@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
......@@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -23,8 +23,6 @@ fi
# exit 1
#fi
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=8000 \
--use_dB_normalization=False \
--num_samples=-1 \
......@@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
......@@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
......
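With the `utt2spk` field added above, each manifest line is a self-contained JSON object per utterance; a representative line (all values illustrative) looks like:

```python
import json

# one manifest entry, as emitted by create_manifest above (values illustrative)
print(json.dumps({
    "utt": "utt_0001",
    "utt2spk": "speaker_01",          # parent directory name of the audio file
    "feat": "/path/to/speaker_01/utt_0001.wav",
    "feat_shape": (4.32, ),           # duration in seconds
    "text": "the transcription",
}))
```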
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
......@@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
......
......@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print("Creating manifest %s ..." % manifest_path)
json_lines = []
total_sec = 0.0
total_text = 0.0
total_char = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)):
......@@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split()
nchars = len(segments[1:])
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.abspath(
os.path.join(subfolder, segments[0] + '.flac'))
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
'utt':
os.path.splitext(os.path.basename(audio_filepath))[0],
'feat':
audio_filepath,
'feat_shape': (duration, ), #second
'text':
text
'utt': utt,
'utt2spk': utt2spk,
'feat': audio_filepath,
'feat_shape': (duration, ), # second
'text': text,
}))
total_sec += duration
total_text += len(text)
total_char += nchars
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
......@@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_char} char", file=f)
print(f"{total_char / total_sec} char/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
......
......@@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
'utt':
os.path.splitext(os.path.basename(audio_filepath))[0],
'feat':
audio_filepath,
'utt': utt,
'utt2spk': utt2spk,
'feat': audio_filepath,
'feat_shape': (duration, ), #second
'text':
text
'text': text,
}))
total_sec += duration
......
......@@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
translation_str = " ".join(translation.split())
transcription_str = " ".join(trancription.split())
json_lines.append(
json.dumps(
{
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': " ".join(translation.split()),
'text1': " ".join(trancription.split())
'text': [translation_str, transcription_str],
},
ensure_ascii=False))
......
......@@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
assert os.path.exists(audio_path) and os.path.exists(text_path)
audio_id = os.path.basename(audio_path)[:-4]
spk = audio_id.split('_')[0]
word_text, syllable_text, phone_text = read_trn(text_path)
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
......@@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': spk,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': word_text, # charactor
......
......@@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': utt_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': str(audio_path),
'feat_shape': (duration, ), # second
'text': word_text, # word
'phone': phone_text,
'spk': spk,
'gender': gender,
},
ensure_ascii=False))
......
......@@ -24,6 +24,7 @@ import json
import os
import soundfile
from pathlib import Path
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
......@@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = phn_dict[audio_id]
gender_spk = str(Path(audio_path).parent.stem)
spk = gender_spk[1:]
gender = gender_spk[0]
utt_id = '_'.join([spk, gender, audio_id])
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
......
......@@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
audio_data, samplerate = soundfile.read(u)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(u))[0]
json_lines.append(
json.dumps({
'utt': os.path.splitext(os.path.basename(u))[0],
'utt': utt,
'utt2spk': speaker,
'feat': u,
'feat_shape': (duration, ), #second
'text': trans.lower()
......
# ASR
* s0 is for deepspeech2 offline
* s1 is for transformer/conformer/U2
* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data
| Data Subset | Duration in Seconds |
......
......@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=True \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |
\ No newline at end of file
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
sample_rate=16000
feat_dim=80
source ${MAIN_ROOT}/utils/parse_options.sh
......@@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1
fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw
for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${sub} data/manifest.${sub}.raw
done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw
for sub in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${sub}.raw >> data/manifest.train.raw
done
for set in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw
for sub in dev-clean dev-other; do
cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done
for set in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw
for sub in test-clean test-other; do
cat data/manifest.${sub}.raw >> data/manifest.test.raw
done
fi
......@@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--spectrum_type="fbank" \
--feat_dim=80 \
--feat_dim=${feat_dim} \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--sample_rate=${sample_rate} \
--stride_ms=${stride_ms} \
--window_ms=${window_ms} \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -85,16 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do
for sub in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
--manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
......@@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}&
done
wait
for sub in train dev; do
mv data/manifest.${sub} data/manifest.${sub}.fmt
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for sub in train dev; do
remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
done
fi
echo "LibriSpeech Data preparation done."
......
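The `remove_longshortdata.py` call above drops utterances whose frame or character counts exceed the given limits. A minimal sketch of that filter, under the assumption that `feat_shape[0]` holds the duration in seconds (so frames ≈ duration × 1000 / stride_ms):

```python
import json


def keep(entry, maxframes=3000, maxchars=400, stride_ms=10):
    frames = entry["feat_shape"][0] * 1000.0 / stride_ms  # seconds -> frames
    return frames <= maxframes and len(entry["text"]) <= maxchars


with open("data/manifest.train.fmt") as fin, \
        open("data/manifest.train", "w") as fout:
    for line in fin:
        if keep(json.loads(line)):
            fout.write(line)
```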
......@@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
# TED En -> Zh
* t0 for u2 speech translation
* st0 - conformer/transformer speech translation
......@@ -9,7 +9,7 @@ stop_stage=100
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
data_dir=./TED_EnZh
data_dir=./TED-En-Zh
source ${MAIN_ROOT}/utils/parse_options.sh
......@@ -21,7 +21,7 @@ mkdir -p data
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${data_dir} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
echo "."
......@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -88,8 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
--feat_type "raw" \
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
......
......@@ -22,7 +22,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
# thchs30
* a0 for mfa alignment
* align0 - mfa alignment
# TIMIT
* s1 u2 model with phone unit
ASR model with phone unit
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
\ No newline at end of file
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -14,7 +14,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: "word"
mean_std_filepath: ""
augmentation_config: ""
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
* s0 for deepspeech2
* s1 for U2
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
......@@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"
......@@ -63,7 +63,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -11,11 +11,11 @@ data:
max_output_input_ratio: 10.0
collator:
mean_std_filepath: ""
mean_std_filepath: data/mean_std.json
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"
......@@ -69,7 +69,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
......
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech)
A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition
## Description
### Creation
All the data is collected from YouTube and podcasts. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data.
### Categories
In summary, WenetSpeech groups all data into 3 categories, as the following table shows:
| Set | Hours | Confidence | Usage |
|------------|-------|-------------|---------------------------------------|
| High Label | 10005 | >=0.95 | Supervised Training |
| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training |
| Unlabel | 9952 | / | Unsupervised training or Pre-training |
| In Total | 22435 | / | All above |
### High Label Data
We classify the high-label data into 10 groups according to domain, speaking style, and scenario.
| Domain | Youtube | Podcast | Total |
|-------------|---------|---------|--------|
| audiobook | 0 | 250.9 | 250.9 |
| commentary | 112.6 | 135.7 | 248.3 |
| documentary | 386.7 | 90.5 | 477.2 |
| drama | 4338.2 | 0 | 4338.2 |
| interview | 324.2 | 614 | 938.2 |
| news | 0 | 868 | 868 |
| reading | 0 | 1110.2 | 1110.2 |
| talk | 204 | 90.7 | 294.7 |
| variety | 603.3 | 224.5 | 827.8 |
| others | 144 | 507.5 | 651.5 |
| Total | 6113 | 3892 | 10005 |
As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales.
| Training Subsets | Confidence | Hours |
|------------------|-------------|-------|
| L | [0.95, 1.0] | 10005 |
| M | 1.0 | 1000 |
| S | 1.0 | 100 |
### Evaluation Sets
| Evaluation Sets | Hours | Source | Description |
|-----------------|-------|--------------|-----------------------------------------------------------------------------------------|
| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training |
| TEST\_NET | 23 | Internet | Matched test set |
| TEST\_MEETING | 15 | Real meeting | Mismatched test set: far-field, conversational, spontaneous meeting speech |
\ No newline at end of file
## Pack Model
Pack the model into a tar.gz archive, e.g.
```bash
./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeech.pdparams
```
Show the contents of model.tar.gz:
```
tar tf model.tar.gz
```
# WenetSpeech
## Conformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | |
## Conformer Pretrain Model
Pretrained model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
\ No newline at end of file
# network architecture
model:
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
cnn_module_norm: layer_norm
activation_type: swish
pos_enc_layer_type: rel_pos
selfattention_layer_type: rel_selfattn
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.1 # second
max_input_len: 12.0 # second
min_output_len: 1.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
training:
n_epoch: 240
accum_grad: 16
global_grad_clip: 5.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-6
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
lr_decay: 1.0
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
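A small sketch of the `decoding_chunk_size` convention documented above (<0 decodes the whole utterance at once; >0 slices it into fixed-size chunks; 0 is reserved for training):

```python
def chunk_ranges(num_frames, decoding_chunk_size):
    if decoding_chunk_size < 0:          # full-utterance (non-streaming) decode
        return [(0, num_frames)]
    assert decoding_chunk_size > 0, "0 is only valid for training"
    return [(start, min(start + decoding_chunk_size, num_frames))
            for start in range(0, num_frames, decoding_chunk_size)]


print(chunk_ranges(100, 16))  # [(0, 16), (16, 32), ..., (96, 100)]
```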
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
#!/bin/bash
# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
stage=-1
stop_stage=100
# Use your own data path. You need to download the WenetSpeech dataset by yourself.
wenetspeech_data_dir=./wenetspeech
# Make sure you have 1.2 TB of free space for ${shards_dir}
shards_dir=./wenetspeech_shards
# WenetSpeech training set
set=L
train_set=train_`echo $set | tr 'A-Z' 'a-z'`
dev_set=dev
test_sets="test_net test_meeting"
cmvn=true
cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
set -u
set -o pipefail
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
# download data
echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
exit 0;
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Data preparation"
local/wenetspeech_data_prep.sh \
--train-subset $set \
$wenetspeech_data_dir \
data || exit 1;
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# generate manifests
python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"
if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1
fi
for dataset in train dev test; do
mv data/manifest.${dataset} data/manifest.${dataset}.raw
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
fi
dict=data/dict/lang_char.txt
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done
wait
fi
echo "Aishell data preparation done."
exit 0
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import argparse
import json
def get_args():
parser = argparse.ArgumentParser(description="""
This script is used to process raw json dataset of WenetSpeech,
where the long wav is split into segments and
data in wenet format is generated.
""")
parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
parser.add_argument('output_dir', help="""Output dir for prepared data""")
args = parser.parse_args()
return args
def meta_analysis(input_json, output_dir):
input_dir = os.path.dirname(input_json)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
try:
with open(input_json, 'r') as injson:
json_data = json.load(injson)
except Exception:
sys.exit(f'Failed to load input json file: {input_json}')
else:
if json_data['audios'] is not None:
with open(f'{output_dir}/text', 'w') as utt2text, \
open(f'{output_dir}/segments', 'w') as segments, \
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
open(f'{output_dir}/wav.scp', 'w') as wavscp, \
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
open(f'{output_dir}/reco2dur', 'w') as reco2dur:
for long_audio in json_data['audios']:
try:
long_audio_path = os.path.realpath(
os.path.join(input_dir, long_audio['path']))
aid = long_audio['aid']
segments_lists = long_audio['segments']
duration = long_audio['duration']
assert (os.path.exists(long_audio_path))
except AssertionError:
print(f'''Warning: {aid} something is wrong,
maybe AssertionError, skipped''')
continue
except Exception:
print(f'''Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped''')
continue
else:
wavscp.write(f'{aid}\t{long_audio_path}\n')
reco2dur.write(f'{aid}\t{duration}\n')
for segment_file in segments_lists:
try:
sid = segment_file['sid']
start_time = segment_file['begin_time']
end_time = segment_file['end_time']
dur = end_time - start_time
text = segment_file['text']
segment_subsets = segment_file["subsets"]
except Exception:
print(f'''Warning: {segment_file} something
is wrong, skipped''')
continue
else:
utt2text.write(f'{sid}\t{text}\n')
segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
)
utt2dur.write(f'{sid}\t{dur}\n')
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
def main():
args = get_args()
meta_analysis(args.input_json, args.output_dir)
if __name__ == '__main__':
main()
\ No newline at end of file
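For reference, a minimal input that exercises `meta_analysis` above (assuming the function is in scope), inferred from the fields the script reads; the empty audio file exists only to satisfy the existence check, and all values are illustrative:

```python
import json
import os

os.makedirs("demo", exist_ok=True)
open("demo/a.opus", "w").close()  # placeholder so the path check passes

meta = {"audios": [{
    "aid": "A0001",
    "path": "a.opus",
    "duration": 3.2,
    "segments": [{
        "sid": "A0001_S0001",
        "begin_time": 0.0,
        "end_time": 3.2,
        "text": "你好",
        "subsets": ["L"],
    }],
}]}
with open("demo/WenetSpeech.json", "w") as fout:
    json.dump(meta, fout)

meta_analysis("demo/WenetSpeech.json", "demo/out")  # writes wav.scp, text, segments, ...
```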
# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# process_opus.py: segmentation and downsampling of opus audio
# usage: python3 process_opus.py wav.scp segments output_wav.scp
from pydub import AudioSegment
import sys
import os
def read_file(wav_scp, segments):
wav_scp_dict = {}
with open(wav_scp, 'r', encoding='UTF-8') as fin:
for line_str in fin:
wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path
utt_list = []
seg_path_list = []
start_time_list = []
end_time_list = []
with open(segments, 'r', encoding='UTF-8') as fin:
for line_str in fin:
arr = line_str.strip().split()
assert len(arr) == 4
utt_list.append(arr[0])
seg_path_list.append(wav_scp_dict[arr[1]])
start_time_list.append(float(arr[2]))
end_time_list.append(float(arr[3]))
return utt_list, seg_path_list, start_time_list, end_time_list
# TODO(Qijie): Fix the process logic
def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list):
num_utts = len(utt_list)
step = int(num_utts * 0.01)
with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
previous_wav_path = ""
for i in range(num_utts):
utt_id = utt_list[i]
current_wav_path = seg_path_list[i]
output_dir = (os.path.dirname(current_wav_path)) \
.replace("audio", 'audio_seg')
seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
# if not os.path.exists(output_dir):
# os.makedirs(output_dir)
if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000)
target_audio = source_wav[start:end].set_frame_rate(16000)
target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path))
if i % step == 0:
print("seg wav finished: {}%".format(int(i / step)))
def main():
wav_scp = sys.argv[1]
segments = sys.argv[2]
output_wav_scp = sys.argv[3]
utt_list, seg_path_list, start_time_list, end_time_list \
= read_file(wav_scp, segments)
output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list)
if __name__ == '__main__':
main()
\ No newline at end of file
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# streaming decoding only supports batch_size=1
batch_size=1
else
batch_size=64
fi
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0
#!/usr/bin/env bash
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Seasalt AI, Inc (Author: Guoguo Chen)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -o pipefail
stage=1
prefix=
train_subset=L
. ./tools/parse_options.sh || exit 1;
filter_by_id () {
idlist=$1
input=$2
output=$3
field=1
if [ $# -eq 4 ]; then
field=$4
fi
cat $input | perl -se '
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
while(<>) {
@A = split;
@A > 0 || die "Invalid file line $_";
@A >= $field || die "Invalid file line $_";
if ($seen{$A[$field-1]}) {
print $_;
}
}' -- -idlist="$idlist" -field="$field" > $output ||\
(echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}
subset_data_dir () {
utt_list=$1
src_dir=$2
dest_dir=$3
mkdir -p $dest_dir || exit 1;
# wav.scp text segments utt2dur
filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
(echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
(echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
(echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
(echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
rm -f $dest_dir/reco
}
if [ $# -ne 2 ]; then
echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
echo ""
echo "This script takes the WenetSpeech source directory, and prepares the"
echo "WeNet format data directory."
echo " --prefix <prefix> # Prefix for output data directory."
echo " --stage <stage> # Processing stage."
echo " --train-subset <L|M|S|W> # Train subset to be created."
exit 1
fi
wenetspeech_dir=$1
data_dir=$2
declare -A subsets
subsets=(
[L]="train_l"
[M]="train_m"
[S]="train_s"
[W]="train_w"
[DEV]="dev"
[TEST_NET]="test_net"
[TEST_MEETING]="test_meeting")
prefix=${prefix:+${prefix}_}
corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
echo "$0: Extract meta into $corpus_dir"
# Sanity check.
[ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
[ ! -d $wenetspeech_dir/audio ] &&\
echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
[ ! -d $corpus_dir ] && mkdir -p $corpus_dir
# Files to be created:
# wav.scp text segments utt2dur
python3 local/extract_meta.py \
$wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: Split data to train, dev, test_net, and test_meeting"
[ ! -f $corpus_dir/utt2subsets ] &&\
echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
for label in $train_subset DEV TEST_NET TEST_MEETING; do
if [ ! ${subsets[$label]+set} ]; then
echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
fi
subset=${subsets[$label]}
[ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
cat $corpus_dir/utt2subsets | \
awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
> $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
$corpus_dir $data_dir/${prefix}$subset || exit 1;
done
fi
echo "$0: Done"
\ No newline at end of file
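`filter_by_id` above keeps only the lines of a Kaldi-style table whose key column appears in an id list; an equivalent Python sketch of the same logic:

```python
def filter_by_id(idlist_path, input_path, output_path, field=1):
    """Keep lines of input whose 1-based column `field` is in the id list."""
    with open(idlist_path) as fin:
        seen = {line.split()[0] for line in fin if line.strip()}
    with open(input_path) as fin, open(output_path, "w") as fout:
        for line in fin:
            cols = line.split()
            if len(cols) >= field and cols[field - 1] in seen:
                fout.write(line)
```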
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
#!/bin/bash
. path.sh || exit 1;
set -e
gpus=0,1,2,3,4,5,6,7
stage=0
stop_stage=100
conf_path=conf/conformer.yaml
average_checkpoint=true
avg_num=10
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
audio_file="data/tmp.wav"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
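`avg.sh best exp/${ckpt}/checkpoints ${avg_num}` above averages the parameters of the best `avg_num` checkpoints before testing. A minimal paddle sketch of the idea — a simplification under assumed checkpoint paths, not the actual utility:

```python
import paddle


def average_checkpoints(ckpt_paths, out_path):
    avg = None
    for path in ckpt_paths:
        state = paddle.load(path)  # parameter name -> tensor
        if avg is None:
            avg = {k: v.astype("float32") for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].astype("float32")
    for k in avg:
        avg[k] /= float(len(ckpt_paths))
    paddle.save(avg, out_path)


# e.g. average_checkpoints(["exp/conformer/checkpoints/10.pdparams", ...],
#                          "exp/conformer/checkpoints/avg_10.pdparams")
```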
../../../utils
\ No newline at end of file
......@@ -27,7 +27,9 @@ from paddle import distributed as dist
from paddle.io import DataLoader
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
......@@ -249,92 +251,103 @@ class U2Trainer(Trainer):
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
if self.train:
# train/valid dataset, return token ids
self.train_loader = BatchDataLoader(
json_file=config.data.train_manifest,
train_mode=True,
sortagrad=False,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=self.args.nprocs,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=config.collator.num_workers,
subsampling_factor=1,
num_encs=1)
self.valid_loader = BatchDataLoader(
json_file=config.data.dev_manifest,
train_mode=False,
sortagrad=False,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev,
num_workers=config.collator.num_workers, )
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
# filter test examples; this yields fewer examples but no mismatch with training,
# and allows a large batch size to save time, so filter test egs now.
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config),
num_workers=config.collator.num_workers, )
# return text token id
config.collator.keep_transcription_text = False
self.align_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config),
num_workers=config.collator.num_workers, )
logger.info("Setup train/valid/test/align Dataloader!")
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=self.args.nprocs,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=config.collator.num_workers,
subsampling_factor=1,
num_encs=1)
logger.info("Setup train/valid Dataloader!")
else:
# test dataset, return raw text
self.test_loader = BatchDataLoader(
json_file=config.data.test_manifest,
train_mode=False,
sortagrad=False,
batch_size=config.decoding.batch_size,
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=1,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=1,
subsampling_factor=1,
num_encs=1)
self.align_loader = BatchDataLoader(
json_file=config.data.test_manifest,
train_mode=False,
sortagrad=False,
batch_size=config.decoding.batch_size,
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=1,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=1,
subsampling_factor=1,
num_encs=1)
logger.info("Setup test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size
if self.train:
model_conf.input_dim = self.train_loader.feat_dim
model_conf.output_dim = self.train_loader.vocab_size
else:
model_conf.input_dim = self.test_loader.feat_dim
model_conf.output_dim = self.test_loader.vocab_size
model = U2Model.from_config(model_conf)
......@@ -343,6 +356,11 @@ class U2Trainer(Trainer):
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
self.model = model
logger.info("Setup model!")
if not self.train:
return
train_config = config.training
optim_type = train_config.optim
......@@ -383,10 +401,9 @@ class U2Trainer(Trainer):
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup model/optimizer/lr_scheduler!")
logger.info("Setup optimizer/lr_scheduler!")
class U2Tester(U2Trainer):
......@@ -421,14 +438,19 @@ class U2Tester(U2Trainer):
def __init__(self, config, args):
super().__init__(config, args)
self.text_feature = TextFeaturizer(
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def ordid2token(self, texts, texts_len):
def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(''.join([chr(i) for i in ids]))
trans.append(text_feature.defeaturize(ids.numpy().tolist()))
return trans
def compute_metrics(self,
......@@ -444,12 +466,11 @@ class U2Tester(U2Trainer):
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
text_feature = self.test_loader.collate_fn.text_feature
target_transcripts = self.ordid2token(texts, texts_len)
target_transcripts = self.id2token(texts, texts_len, self.text_feature)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
text_feature=text_feature,
text_feature=self.text_feature,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
......@@ -499,7 +520,7 @@ class U2Tester(U2Trainer):
self.model.eval()
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
stride_ms = self.test_loader.collate_fn.stride_ms
stride_ms = self.config.collator.stride_ms
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0
......@@ -558,8 +579,7 @@ class U2Tester(U2Trainer):
def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size,
self.align_loader.collate_fn.stride_ms,
self.align_loader.collate_fn.vocab_list,
self.config.collator.stride_ms, self.vocab_list,
self.args.result_file)
def load_inferspec(self):
......@@ -573,7 +593,7 @@ class U2Tester(U2Trainer):
infer_model = U2InferModel.from_pretrained(self.test_loader,
self.config.model.clone(),
self.args.checkpoint_path)
feat_dim = self.test_loader.collate_fn.feature_size
feat_dim = self.test_loader.feat_dim
input_spec = [
paddle.static.InputSpec(shape=[1, None, feat_dim],
dtype='float32'), # audio, [B,T,D]
......
......@@ -392,6 +392,7 @@ class U2Tester(U2Trainer):
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
......@@ -529,8 +530,7 @@ class U2Tester(U2Trainer):
def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size,
self.align_loader.collate_fn.stride_ms,
self.align_loader.collate_fn.vocab_list,
self.config.collator.stride_ms, self.vocab_list,
self.args.result_file)
def load_inferspec(self):
......
......@@ -24,6 +24,8 @@ import soundfile
import soxbindings as sox
from scipy import signal
from .utility import convert_samples_from_float32
from .utility import convert_samples_to_float32
from .utility import subfile_from_tar
......@@ -689,15 +691,7 @@ class AudioSegment():
Audio sample type is usually integer or floating-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
return convert_samples_to_float32(samples)
def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype.
......@@ -708,20 +702,4 @@ class AudioSegment():
This is for writing an audio file.
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif samples.dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
return convert_samples_from_float32(samples, dtype)
......@@ -92,7 +92,9 @@ class TextFeaturizer():
tokens = self.tokenize(text)
ids = []
for token in tokens:
token = token if token in self.vocab_dict else self.unk
if token not in self.vocab_dict:
logger.debug(f"Text Token: {token} -> {self.unk}")
token = self.unk
ids.append(self.vocab_dict[token])
return ids
......
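Since unknown tokens are now mapped to the unk token with only a debug log, a quick round trip through the featurizer is a handy sanity check. A minimal sketch, assuming a char-unit vocab file (the path is a placeholder):

    from paddlespeech.s2t.frontend.featurizer import TextFeaturizer

    text_feature = TextFeaturizer(
        unit_type='char', vocab_filepath='data/vocab.txt')
    ids = text_feature.featurize("some text")  # OOV chars -> unk (DEBUG log)
    text = text_feature.defeaturize(ids)       # inverse used by id2token()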
......@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
__all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
"convert_samples_from_float32"
]
IGNORE_ID = -1
......@@ -342,3 +343,50 @@ def load_cmvn(cmvn_file: str, filetype: str):
else:
raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1]
def convert_samples_to_float32(samples):
"""Convert sample type to float32.
Audio sample type is usually integer or floating-point.
Integers will be scaled to [-1, 1] in float32.
PCM16 -> PCM32
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
def convert_samples_from_float32(samples, dtype):
"""Convert sample type from float32 to dtype.
Audio sample type is usually integer or floating-point. For integer
type, float32 will be rescaled from [-1, 1] to the maximum range
supported by the integer type.
PCM32 -> PCM16
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif samples.dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
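The two helpers are exact inverses up to integer quantization and clipping; a small round-trip sketch:

    import numpy as np

    pcm16 = np.array([0, 16384, -32768], dtype='int16')
    f32 = convert_samples_to_float32(pcm16)            # [0.0, 0.5, -1.0]
    back = convert_samples_from_float32(f32, 'int16')  # [0, 16384, -32768]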
......@@ -199,8 +199,8 @@ class SpeechCollatorBase():
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
text = item['text']
audio = item['input'][0]['feat']
text = item['output'][0]['text']
audio, text = self.process_utterance(audio, text)
audios.append(audio) # [T, D]
......@@ -343,9 +343,10 @@ class TripletSpeechCollator(SpeechCollator):
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
translation = item['text']
transcription = item['text1']
audio = item['input'][0]['feat']
translation = item['output'][0]['text']
transcription = item['output'][1]['text']
audio, translation, transcription = self.process_utterance(
audio, translation, transcription)
......
......@@ -103,7 +103,7 @@ class ManifestDataset(Dataset):
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio)
self._manifest.sort(key=lambda x: x["feat_shape"][0])
self._manifest.sort(key=lambda x: x["input"][0]["shape"][0])
def __len__(self):
return len(self._manifest)
......@@ -188,34 +188,16 @@ class AudioDataset(Dataset):
if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav:
assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark',
'.scp')
data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms))
path_suffix = os.path.splitext(data[0]['feat'].split(':')[0])[-1]
assert path_suffix not in ('.ark', '.scp')
# seconds to frames
data = list(
map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms),
data))
self.input_dim = data[0]['feat_shape'][1]
self.output_dim = data[0]['token_shape'][1]
# with open(data_file, 'r') as f:
# for line in f:
# arr = line.strip().split('\t')
# if len(arr) != 7:
# continue
# key = arr[0].split(':')[1]
# tokenid = arr[5].split(':')[1]
# output_dim = int(arr[6].split(':')[1].split(',')[1])
# if raw_wav:
# wav_path = ':'.join(arr[1].split(':')[1:])
# duration = int(float(arr[2].split(':')[1]) * 1000 / 10)
# data.append((key, wav_path, duration, tokenid))
# else:
# feat_ark = ':'.join(arr[1].split(':')[1:])
# feat_info = arr[2].split(':')[1].split(',')
# feat_dim = int(feat_info[1].strip())
# num_frames = int(feat_info[0].strip())
# data.append((key, feat_ark, num_frames, tokenid))
# self.input_dim = feat_dim
# self.output_dim = output_dim
valid_data = []
for i in range(len(data)):
length = data[i]['feat_shape'][0]
......@@ -223,17 +205,17 @@ class AudioDataset(Dataset):
# remove utterances that are too long or too short, for both input and output
# to prevent from out of memory
if length > max_length or length < min_length:
# logging.warn('ignore utterance {} feature {}'.format(
# data[i][0], length))
pass
elif token_length > token_max_length or token_length < token_min_length:
pass
else:
valid_data.append(data[i])
logger.info(f"raw dataset len: {len(data)}")
data = valid_data
num_data = len(data)
logger.info(f"dataset len after filter: {num_data}")
self.minibatch = []
num_data = len(data)
# Dynamic batch size
if batch_type == 'dynamic':
assert (max_frames_in_batch > 0)
......@@ -258,7 +240,9 @@ class AudioDataset(Dataset):
cur = end
def __len__(self):
"""number of example(batch)"""
return len(self.minibatch)
def __getitem__(self, idx):
"""batch example of idx"""
return self.minibatch[idx]
......@@ -18,8 +18,10 @@ import kaldiio
import numpy as np
import soundfile
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation
from .utility import feat_type
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.log import Log
# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation
__all__ = ["LoadInputsAndTargets"]
......@@ -322,20 +324,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
suffix = filepath.split(":")[0].split('.')[-1].lower()
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
return 'scp'
elif suffix == 'npy':
return 'npy'
elif suffix == 'npz':
return 'npz'
elif suffix in ['wav', 'flac']:
# PCM16
return 'sound'
else:
raise ValueError(f"Not support filetype: {suffix}")
return feat_type(filepath)
class SoundHDF5File():
......
......@@ -17,7 +17,7 @@ import numpy as np
from paddlespeech.s2t.utils.log import Log
__all__ = ["pad_list", "pad_sequence"]
__all__ = ["pad_list", "pad_sequence", "feat_type"]
logger = Log(__name__).getlog()
......@@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray],
out_tensor[:length, i, ...] = tensor
return out_tensor
def feat_type(filepath):
suffix = filepath.split(":")[0].split('.')[-1].lower()
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
return 'scp'
elif suffix == 'npy':
return 'npy'
elif suffix == 'npz':
return 'npz'
elif suffix in ['wav', 'flac']:
# PCM16
return 'sound'
else:
raise ValueError(f"Not support filetype: {suffix}")
......@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
if 'cmvn_file' in configs and configs['cmvn_file']:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(
......@@ -934,8 +934,8 @@ class U2Model(U2DecodeModel):
DeepSpeech2Model: The model built from pretrained result.
"""
with UpdateConfig(config):
config.input_dim = dataloader.collate_fn.feature_size
config.output_dim = dataloader.collate_fn.vocab_size
config.input_dim = dataloader.feat_dim
config.output_dim = dataloader.vocab_size
model = cls.from_config(config)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -13,6 +13,7 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import io
import json
import h5py
import kaldiio
......@@ -157,3 +158,40 @@ class UtteranceCMVN():
x = np.divide(x, std)
return x
class GlobalCMVN():
"Apply Global CMVN"
def __init__(self,
cmvn_path,
norm_means=True,
norm_vars=True,
std_floor=1.0e-20):
self.cmvn_path = cmvn_path
self.norm_means = norm_means
self.norm_vars = norm_vars
self.std_floor = std_floor
with open(cmvn_path) as f:
cmvn_stats = json.load(f)
self.count = cmvn_stats['frame_num']
self.mean = np.array(cmvn_stats['mean_stat']) / self.count
self.square_sums = np.array(cmvn_stats['var_stat'])
self.var = self.square_sums / self.count - self.mean**2
self.std = np.maximum(np.sqrt(self.var), self.std_floor)
def __repr__(self):
return f"""{self.__class__.__name__}(
cmvn_path={self.cmvn_path},
norm_means={self.norm_means},
norm_vars={self.norm_vars},)"""
def __call__(self, x, uttid=None):
# x: [Time, Dim]
if self.norm_means:
x = np.subtract(x, self.mean)
if self.norm_vars:
x = np.divide(x, self.std)
return x
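A minimal usage sketch of the new transform, assuming the JSON file holds the frame_num / mean_stat / var_stat fields read by the constructor (the path is a placeholder):

    import numpy as np

    cmvn = GlobalCMVN("data/mean_std.json")
    feat = np.random.uniform(-1, 1, (100, 80)).astype('float32')  # [Time, Dim]
    feat = cmvn(feat)  # subtract global mean, divide by floored std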
......@@ -16,6 +16,7 @@ import librosa
import numpy
import scipy
import soundfile
import soxbindings as sox
from paddlespeech.s2t.io.reader import SoundHDF5File
......@@ -82,7 +83,6 @@ class SpeedPerturbation():
def __call__(self, x, uttid=None, train=True):
if not train:
return x
x = x.astype(numpy.float32)
if self.accept_uttid:
ratio = self.utt2ratio[uttid]
......@@ -108,6 +108,110 @@ class SpeedPerturbation():
return y
class SpeedPerturbationSox():
"""SpeedPerturbationSox
The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
and sox-speed simply resamples the input,
i.e. both pitch and tempo are changed.
To speed up or slow down the sound of a file,
use `speed` to modify both the pitch and the duration of the file.
Raising the speed shortens the duration.
The default factor is 1.0, which makes no change to the audio;
2.0 doubles the speed, so the duration is halved and the pitch is an octave higher.
"Why use speed option instead of tempo -s in SoX for speed perturbation"
https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8
tempo option:
sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9
speed option:
sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9
If we use the speed option as above, the pitch of the audio will also be changed;
the tempo option does not change the pitch.
"""
def __init__(
self,
lower=0.9,
upper=1.1,
utt2ratio=None,
keep_length=True,
sr=16000,
seed=None, ):
self.sr = sr
self.keep_length = keep_length
self.state = numpy.random.RandomState(seed)
if utt2ratio is not None:
self.utt2ratio = {}
# Use the scheduled ratio for each utterances
self.utt2ratio_file = utt2ratio
self.lower = None
self.upper = None
self.accept_uttid = True
with open(utt2ratio, "r") as f:
for line in f:
utt, ratio = line.rstrip().split(None, 1)
ratio = float(ratio)
self.utt2ratio[utt] = ratio
else:
self.utt2ratio = None
self.accept_uttid = False
# The ratio is given at runtime randomly
self.lower = lower
self.upper = upper
def __repr__(self):
if self.utt2ratio is None:
return f"""{self.__class__.__name__}(
lower={self.lower},
upper={self.upper},
keep_length={self.keep_length},
sample_rate={self.sr})"""
else:
return f"""{self.__class__.__name__}(
utt2ratio={self.utt2ratio_file},
sample_rate={self.sr})"""
def __call__(self, x, uttid=None, train=True):
if not train:
return x
x = x.astype(numpy.float32)
if self.accept_uttid:
ratio = self.utt2ratio[uttid]
else:
ratio = self.state.uniform(self.lower, self.upper)
tfm = sox.Transformer()
tfm.set_globals(multithread=False)
tfm.speed(ratio)
y = tfm.build_array(input_array=x, sample_rate_in=self.sr)
if self.keep_length:
diff = abs(len(x) - len(y))
if len(y) > len(x):
# Truncate to the original length
y = y[diff // 2:-((diff + 1) // 2)]
elif len(y) < len(x):
# Assume the time-axis is the first: (Time, Channel)
pad_width = [(diff // 2, (diff + 1) // 2)] + [
(0, 0) for _ in range(y.ndim - 1)
]
y = numpy.pad(
y, pad_width=pad_width, constant_values=0, mode="constant")
if y.ndim == 2 and x.ndim == 1:
# (T, C) -> (T)
y = y.squeeze(1)
return y
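# A hedged train-time usage sketch of the class above (random ratio path):
#
#   sp = SpeedPerturbationSox(lower=0.9, upper=1.1, keep_length=True, sr=16000)
#   x = numpy.random.uniform(-1, 1, 16000).astype(numpy.float32)  # 1 s of audio
#   y = sp(x, train=True)  # ratio drawn from [0.9, 1.1]; len(y) == len(x)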
class BandpassPerturbation():
"""BandpassPerturbation
......
......@@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
:returns numpy.ndarray: time warped spectrogram (time, freq)
"""
window = max_time_warp
if window == 0:
return x
if mode == "PIL":
t = x.shape[0]
if t - window <= window:
......
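With the new guard, max_time_warp=0 is a safe no-op instead of raising on an empty random range; a small sketch:

    import numpy as np

    spec = np.random.randn(200, 80).astype('float32')      # (time, freq)
    warped = time_warp(spec, max_time_warp=5, mode="PIL")   # shape preserved
    same = time_warp(spec, max_time_warp=0)                 # early return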
......@@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
import librosa
import numpy as np
from python_speech_features import logfbank
def stft(x,
......@@ -304,3 +305,94 @@ class IStft():
win_length=self.win_length,
window=self.window,
center=self.center, )
class LogMelSpectrogramKaldi():
def __init__(
self,
fs=16000,
n_mels=80,
n_fft=512, # fft point
n_shift=160, # unit:sample, 10ms
win_length=400, # unit:sample, 25ms
window="povey",
fmin=20,
fmax=None,
eps=1e-10,
dither=False):
self.fs = fs
self.n_mels = n_mels
self.n_fft = n_fft
if n_shift > win_length:
raise ValueError("Stride size must not be greater than "
"window size.")
self.n_shift = n_shift / fs # unit: second
self.win_length = win_length / fs # unit: second
self.window = window
self.fmin = fmin
if fmax is None:
fmax_ = self.fs / 2
elif fmax > int(self.fs / 2):
raise ValueError("fmax must not be greater than half of "
"sample rate.")
else:
fmax_ = fmax
self.fmax = fmax_
self.eps = eps
self.remove_dc_offset = True
self.preemph = 0.97
self.dither = dither
def __repr__(self):
return (
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, "
"fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format(
name=self.__class__.__name__,
fs=self.fs,
n_mels=self.n_mels,
n_fft=self.n_fft,
n_shift=self.n_shift,
preemph=self.preemph,
win_length=self.win_length,
window=self.window,
fmin=self.fmin,
fmax=self.fmax,
eps=self.eps,
dither=self.dither, ))
def __call__(self, x):
"""
Args:
x (np.ndarray): shape (Ti,)
Raises:
ValueError: not support (Ti, C)
Returns:
np.ndarray: (T, D)
"""
if x.ndim != 1:
raise ValueError("Not support x: [Time, Channel]")
if x.dtype in np.sctypes['float']:
# PCM32 -> PCM16
bits = np.iinfo(np.int16).bits
x = x * 2**(bits - 1)
# logfbank need PCM16 input
y = logfbank(
signal=x,
samplerate=self.fs,
winlen=self.win_length, # unit: second
winstep=self.n_shift, # unit: second
nfilt=self.n_mels,
nfft=self.n_fft,
lowfreq=self.fmin,
highfreq=self.fmax,
dither=self.dither,
remove_dc_offset=self.remove_dc_offset,
preemph=self.preemph,
wintype=self.window)
return y
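A minimal usage sketch of the extractor above, with dither disabled for determinism:

    import numpy as np

    fbank = LogMelSpectrogramKaldi(
        fs=16000, n_mels=80, n_shift=160, win_length=400, dither=False)
    wav = np.random.uniform(-1, 1, 16000).astype('float32')  # rescaled to PCM16 internally
    feat = fbank(wav)  # (T, 80) log-fbank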
......@@ -45,7 +45,8 @@ import_alias = dict(
stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram",
wpe="paddlespeech.s2t.transform.wpe:WPE",
channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector",
)
fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi",
cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN")
class Transformation():
......
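With fbank_kaldi and cmvn_json registered, a preprocess pipeline can be declared in a YAML file (a top-level "process" list of such types) and applied as a callable. A hedged sketch, assuming Transformation accepts a YAML path and a train flag as in ESPnet (the path is a placeholder):

    import numpy as np
    from paddlespeech.s2t.transform.transformation import Transformation

    transform = Transformation("conf/preprocess.yaml")
    wav = np.random.uniform(-1, 1, 16000).astype('float32')
    feat = transform(wav, train=True)  # e.g. fbank -> global CMVN -> SpecAugment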
......@@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', float, 10.0, "stride length in ms.")
add_arg('window_ms', float, 20.0, "stride length in ms.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
......@@ -61,8 +61,8 @@ def main():
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=args.stride_ms,
window_ms=args.window_ms,
stride_ms=float(args.stride_ms),
window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
......
......@@ -20,13 +20,13 @@ import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
......@@ -62,27 +62,76 @@ def main():
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# jsonline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
line_json['token'] = tokens
line_json['token_id'] = tokenids
line_json['token_shape'] = (len(tokenids), vocab_size)
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
if args.feat_type == 'raw':
feat_shape.append(feat_dim)
line_json['filetype'] = 'sound'
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
fout.write(json.dumps(line_json) + '\n')
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('kaldi feat is not supported yet!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"Examples number: {count}")
print(f"{args.manifest_paths} Examples number: {count}")
fout.close()
......
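For concreteness, one formatted manifest line might look like this (values are illustrative; a "sound" input stores [seconds, feat_dim] as its shape):

    {"utt": "111-2222-333", "utt2spk": "111-2222",
     "input": [{"name": "input1", "shape": [3.52, 161],
                "feat": "/path/to/111-2222-333.wav", "filetype": "sound"}],
     "output": [{"name": "target1", "shape": [4, 5002], "text": "abcd",
                 "token": "a b c d", "tokenid": "5 8 10 12"}]}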
......@@ -20,13 +20,13 @@ import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
......@@ -79,9 +79,11 @@ def main():
line_json['token1'] = tokens
line_json['token_id1'] = tokenids
line_json['token_shape1'] = (len(tokenids), vocab_size)
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
if args.feat_type == 'raw':
filetype = feat_type(line_json['feat'])
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('kaldi feat is not supported yet!')
......
#!/usr/bin/env bash
# Copyright 2019 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
[ -f ./path.sh ] && . ./path.sh
results=""
# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
# exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"'
lm=""
dict=""
etc=""
outfile="model"
preprocess_conf=""
help_message=$(cat <<EOF
Usage: $0 --lm <lm> --dict <dict> <train_conf> <dec_conf> <cmvn> <e2e>, for example:
<lm>: exp/train_rnnlm/rnnlm.model.best
<dict>: data/lang_char
<train_conf>: conf/train.yaml
<dec_conf>: conf/decode.yaml
<cmvn>: data/tr_it/cmvn.ark
<e2e>: exp/tr_it_pytorch_train/results/model.last10.avg.best
EOF
)
. utils/parse_options.sh
if [ $# != 4 ]; then
echo "${help_message}"
exit 1
fi
tr_conf=$1
dec_conf=$2
cmvn=$3
e2e=$4
echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
echo " - model link: (put the model link manually.)"
# configs
if [ -e ${tr_conf} ]; then
tar cfh ${outfile}.tar ${tr_conf}
echo -n " - training config file: \`"
echo ${tr_conf} | sed -e "s/$/\`/"
else
echo "missing ${tr_conf}"
exit 1
fi
if [ -e ${dec_conf} ]; then
tar rfh ${outfile}.tar ${dec_conf}
echo -n " - decoding config file: \`"
echo ${dec_conf} | sed -e "s/$/\`/"
else
echo "missing ${dec_conf}"
exit 1
fi
# NOTE(kan-bayashi): preprocess conf is optional
if [ -n "${preprocess_conf}" ]; then
tar rfh ${outfile}.tar ${preprocess_conf}
echo -n " - preprocess config file: \`"
echo ${preprocess_conf} | sed -e "s/$/\`/"
fi
# cmvn
if [ -e ${cmvn} ]; then
tar rfh ${outfile}.tar ${cmvn}
echo -n " - cmvn file: \`"
echo ${cmvn} | sed -e "s/$/\`/"
else
echo "missing ${cmvn}"
exit 1
fi
# e2e
if [ -e ${e2e} ]; then
tar rfh ${outfile}.tar ${e2e}
echo -n " - e2e file: \`"
echo ${e2e} | sed -e "s/$/\`/"
e2e_conf=$(dirname ${e2e})/model.json
if [ ! -e ${e2e_conf} ]; then
echo missing ${e2e_conf}
#exit 1
else
echo -n " - e2e JSON file: \`"
echo ${e2e_conf} | sed -e "s/$/\`/"
tar rfh ${outfile}.tar ${e2e_conf}
fi
else
echo "missing ${e2e}"
exit 1
fi
# lm
if [ -n "${lm}" ]; then
if [ -e ${lm} ]; then
tar rfh ${outfile}.tar ${lm}
echo -n " - lm file: \`"
echo ${lm} | sed -e "s/$/\`/"
lm_conf=$(dirname ${lm})/model.json
if [ ! -e ${lm_conf} ]; then
echo missing ${lm_conf}
exit 1
else
echo -n " - lm JSON file: \`"
echo ${lm_conf} | sed -e "s/$/\`/"
tar rfh ${outfile}.tar ${lm_conf}
fi
else
echo "missing ${lm}"
exit 1
fi
fi
# dict
if [ -n "${dict}" ]; then
if [ -e ${dict} ]; then
tar rfh ${outfile}.tar ${dict}
echo -n " - dict file: \`"
echo ${dict} | sed -e "s/$/\`/"
else
echo "missing ${dict}"
exit 1
fi
fi
# etc
for x in ${etc}; do
if [ -e ${x} ]; then
tar rfh ${outfile}.tar ${x}
echo -n " - etc file: \`"
echo ${x} | sed -e "s/$/\`/"
else
echo "missing ${x}"
exit 1
fi
done
# finally compress the tar file
gzip -f ${outfile}.tar
# results
if [ -n "${results}" ]; then
echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)"
echo "\`\`\`"
fi
for x in ${results}; do
if [ -e ${x} ]; then
echo "${x}"
grep -e Avg -e SPKR -m 2 ${x}
else
echo "missing ${x}"
exit 1
fi
done
if [ -n "${results}" ]; then
echo "\`\`\`"
fi
exit 0
#!/usr/bin/env python3
"""remove longshort data from manifest"""
import argparse
import logging
import jsonlines
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
# manifest after format
# jsonline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
def get_parser():
parser = argparse.ArgumentParser(
description="remove longshort data from format manifest",
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument(
"--verbose", "-V", default=0, type=int, help="Verbose option")
parser.add_argument(
"--iaxis",
default=0,
type=int,
help="multi inputs index, 0 is the first")
parser.add_argument(
"--oaxis",
default=0,
type=int,
help="multi outputs index, 0 is the first")
parser.add_argument("--maxframes", default=2000, type=int, help="maxframes")
parser.add_argument("--minframes", default=10, type=int, help="minframes")
parser.add_argument("--maxchars", default=200, type=int, help="max tokens")
parser.add_argument("--minchars", default=0, type=int, help="min tokens")
parser.add_argument(
"--stride_ms", default=10, type=int, help="stride in ms unit.")
parser.add_argument(
"rspecifier",
type=str,
help="jsonl format manifest. e.g. manifest.jsonl")
parser.add_argument(
"wspecifier_or_wxfilename",
type=str,
help="Write specifier. e.g. manifest.jsonl")
return parser
def filter_input(args, line):
tmp = line['input'][args.iaxis]
if args.sound:
# seconds to frames
nframe = tmp['shape'][0] * 1000 / args.stride_ms
else:
nframe = tmp['shape'][0]
if nframe < args.minframes or nframe > args.maxframes:
return True
else:
return False
def filter_output(args, line):
nchars = len(line['output'][args.oaxis]['text'])
if nchars < args.minchars or nchars > args.maxchars:
return True
else:
return False
def main():
args = get_parser().parse_args()
logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
if args.verbose > 0:
logging.basicConfig(level=logging.INFO, format=logfmt)
else:
logging.basicConfig(level=logging.WARN, format=logfmt)
logging.info(get_commandline_args())
with jsonlines.open(args.rspecifier, 'r') as reader:
lines = list(reader)
logging.info(f"Example: {len(lines)}")
feat = lines[0]['input'][args.iaxis]['feat']
args.sound = False
if feat.split(':')[0].split('.')[-1] not in ('ark', 'scp'):
args.sound = True
count = 0
filter = 0
with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
for line in lines:
if filter_input(args, line) or filter_output(args, line):
filter += 1
continue
writer.write(line)
count += 1
logging.info(f"Example after filter: {count}\{filter}")
if __name__ == '__main__':
main()
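The two predicates can be exercised directly on one manifest line; a minimal sketch with hypothetical thresholds:

    import argparse

    args = argparse.Namespace(
        iaxis=0, oaxis=0, sound=True, stride_ms=10,
        maxframes=2000, minframes=10, maxchars=200, minchars=0)
    line = {"input": [{"shape": [3.5, 83], "feat": "x.wav"}],
            "output": [{"shape": [12, 5002], "text": "a b c de"}]}
    drop = filter_input(args, line) or filter_output(args, line)  # False -> keep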
#!/usr/bin/env bash
mindepth=0
maxdepth=1
. utils/parse_options.sh
if [ $# -gt 1 ]; then
echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
echo ""
echo "Show the system environments and the evaluation results in Markdown format."
echo 'The default of <exp> is "exp/".'
exit 1
fi
[ -f ./path.sh ] && . ./path.sh
set -euo pipefail
if [ $# -eq 1 ]; then
exp=$1
else
exp=exp
fi
cat << EOF
<!-- Generated by $0 -->
# RESULTS
## Environments
- date: \`$(LC_ALL=C date)\`
EOF
python3 << EOF
import sys, paddle
pyversion = sys.version.replace('\n', ' ')
print(f"""- python version: \`{pyversion}\`
- paddle version: \`paddle {paddle.__version__}\`""")
EOF
cat << EOF
- Git hash: \`$(git rev-parse HEAD)\`
- Commit date: \`$(git log -1 --format='%cd')\`
EOF
while IFS= read -r expdir; do
if ls ${expdir}/decode_*/result.txt &> /dev/null; then
# 1. Show the result table
cat << EOF
## $(basename ${expdir})
### CER
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
EOF
grep -e Avg ${expdir}/decode_*/result.txt \
| sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
| sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
echo
# 2. Show the result table for WER
if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
cat << EOF
### WER
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
EOF
grep -e Avg ${expdir}/decode_*/result.wrd.txt \
| sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
| sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
echo
fi
fi
done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)