Commit b69021f9 authored by Hui Zhang

fix ds2 scripts and bugs

Parent 09ab9f71
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inferer for DeepSpeech2 model."""
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
# TODO(hui zhang): dynamic load
def main_sp(config, args):
    exp = Tester(config, args)
    exp.setup()
    exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
@@ -113,6 +113,7 @@ class DeepSpeech2Trainer(Trainer):
         if self.parallel:
             model = paddle.DataParallel(model)

+        logger.info(f"{model}")
         layer_tools.print_params(model, logger.info)

         grad_clip = ClipGradByGlobalNormWithLog(
@@ -192,7 +193,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, audio, texts, audio_len, texts_len):
+    def compute_metrics(self, audio, audio_len, texts, texts_len):
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -253,7 +254,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         msg = "Test: "
         msg += "epoch: {}, ".format(self.epoch)
         msg += "step: {}, ".format(self.iteration)
-        msg += ", Final error rate [%s] (%d/%d) = %f" % (
+        msg += "Final error rate [%s] (%d/%d) = %f" % (
             error_rate_type, num_ins, num_ins, errors_sum / len_refs)
         logger.info(msg)
@@ -319,8 +320,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         config.defrost()
         # return raw text
-        config.data.manifest = config.data.test_manifest
+        config.data.keep_transcription_text = True
         config.data.augmentation_config = ""
+        config.data.manifest = config.data.test_manifest
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id
...
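The compute_metrics signature now takes the batch in collator order (audio, audio_len, texts, texts_len). As a minimal sketch of the accumulation pattern it follows — with a simplified stand-in for deepspeech.utils.error_rate.char_errors, whose real implementation is not shown in this diff — error counts and reference lengths are summed across utterances and divided once at the end:

    def char_errors(ref, hyp):
        """Character-level Levenshtein distance; returns (errors, ref_len)."""
        m, n = len(ref), len(hyp)
        dp = list(range(n + 1))
        for i in range(1, m + 1):
            prev, dp[0] = dp[0], i
            for j in range(1, n + 1):
                prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1,
                                         prev + (ref[i - 1] != hyp[j - 1]))
        return float(dp[n]), m

    errors_sum, len_refs = 0.0, 0
    for ref, hyp in [("你好吗", "你号吗"), ("很好", "很好")]:
        errors, ref_len = char_errors(ref, hyp)
        errors_sum += errors
        len_refs += ref_len
    print("CER = %f" % (errors_sum / len_refs))  # 1 error / 5 ref chars = 0.2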
@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
     def _read_mean_std_from_file(self, filepath, eps=1e-20):
         """Load mean and std from file."""
         mean, istd = load_cmvn(filepath, filetype='json')
-        self._mean = mean
-        self._istd = istd
+        self._mean = np.expand_dims(mean, axis=-1)
+        self._istd = np.expand_dims(istd, axis=-1)

     def write_to_file(self, filepath):
         """Write the mean and stddev to the file.
...
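The added trailing axis makes the per-dimension statistics broadcast over time. A minimal numpy sketch of the idea, assuming features laid out as (feat_dim, time) — the shapes are illustrative assumptions, not taken from the repo:

    import numpy as np

    feat_dim, n_frames = 161, 200
    features = np.random.randn(feat_dim, n_frames).astype('float32')
    mean = features.mean(axis=1)                  # shape (161,)
    istd = 1.0 / (features.std(axis=1) + 1e-20)   # shape (161,)

    mean = np.expand_dims(mean, axis=-1)          # shape (161, 1)
    istd = np.expand_dims(istd, axis=-1)          # shape (161, 1)
    normalized = (features - mean) * istd         # broadcasts across frames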
@@ -60,7 +60,7 @@ class SpeechCollator():
             # else text is string, convert to unicode ord
             tokens = []
             if self._keep_transcription_text:
-                assert isinstance(text, str), type(text)
+                assert isinstance(text, str), (type(text), text)
                 tokens = [ord(t) for t in text]
             else:
                 tokens = text  # token ids
...
@@ -154,9 +154,9 @@ class DeepSpeech2Model(nn.Layer):
         assert (self.encoder.output_size == rnn_size * 2)

         self.decoder = CTCDecoder(
-            odim=dict_size + 1,  # <blank> is append after vocab
+            odim=dict_size,  # <blank> is in vocab
             enc_n_units=self.encoder.output_size,
-            blank_id=dict_size,  # last token is <blank>
+            blank_id=0,  # first token is <blank>
             dropout_rate=0.0,
             reduction=True,  # sum
             batch_average=True)  # sum / batch_size
...
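This change switches the blank-symbol convention: the vocabulary now contains <blank> as its first entry, so the output dimension equals the vocab size and no extra unit is appended. A hedged illustration of the two conventions (the vocab contents here are made up for the example):

    vocab = ["<blank>", "a", "b", "c"]  # new convention: blank inside vocab
    odim = len(vocab)                   # 4, no "+ 1" needed
    blank_id = 0                        # first token is <blank>

    old_vocab = ["a", "b", "c"]         # old convention: blank appended
    old_odim = len(old_vocab) + 1       # 4, blank takes the extra slot
    old_blank_id = len(old_vocab)       # 3, last token is <blank>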
@@ -63,7 +63,7 @@ class U2BaseModel(nn.Module):
         default = CfgNode()
         # allow add new item when merge_with_file
         default.cmvn_file = ""
-        default.cmvn_file_type = "npz"
+        default.cmvn_file_type = "json"
         default.input_dim = 0
         default.output_dim = 0
         # encoder related
...
@@ -40,7 +40,8 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
          [[1., 1., 0., 0.],
           [1., 1., 1., 1.]]
     """
-    assert x_len.dim() == 1
+    # TODO(Hui Zhang): jit does not support Tensor.dim() and Tensor.ndim
+    # assert x_len.dim() == 1, (x_len.dim(), x_len)
     max_len = max_len or x_len.max()
     x_len = paddle.unsqueeze(x_len, -1)
     row_vector = paddle.arange(max_len)
...
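sequence_mask builds the mask by broadcasting a [0, max_len) row vector against the lengths column. The same trick in plain numpy, reproducing the docstring example (a sketch for illustration only):

    import numpy as np

    def sequence_mask(x_len, max_len=None, dtype='float32'):
        max_len = max_len or x_len.max()
        row_vector = np.arange(max_len)        # shape (max_len,)
        mask = row_vector < x_len[:, None]     # shape (batch, max_len)
        return mask.astype(dtype)

    print(sequence_mask(np.array([2, 4])))
    # [[1. 1. 0. 0.]
    #  [1. 1. 1. 1.]]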
@@ -127,7 +127,12 @@ class Trainer():
     @mp_tools.rank_zero_only
     def save(self, tag=None, infos: dict=None):
         """Save checkpoint (model parameters and optimizer states).
+
+        Args:
+            tag (int or str, optional): None for step, else use tag, e.g. epoch. Defaults to None.
+            infos (dict, optional): meta data to save. Defaults to None.
         """
         infos = infos if infos else dict()
         infos.update({
             "step": self.iteration,
@@ -220,7 +225,7 @@ class Trainer():
                 'epoch', {'cv_loss': cv_loss,
                           'lr': self.lr_scheduler()}, self.epoch)
-            self.save(infos={'val_loss': cv_loss})
+            self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.lr_scheduler.step()
             self.new_epoch()
...
@@ -7,14 +7,20 @@ data:
     vocab_filepath: data/vocab.txt
     augmentation_config: conf/augmentation.json
     batch_size: 64 # one gpu
-    max_duration: 27.0
-    min_duration: 0.0
+    min_input_len: 0.0
+    max_input_len: 27.0 # second
+    min_output_len: 0.0
+    max_output_len: 400.0
+    min_output_input_ratio: 0.05
+    max_output_input_ratio: 10.0
     specgram_type: linear
     target_sample_rate: 16000
     max_freq: None
     n_fft: None
     stride_ms: 10.0
     window_ms: 20.0
+    delta_delta: False
+    dither: 1.0
     use_dB_normalization: True
     target_dB: -20
     random_seed: 0
@@ -36,6 +42,7 @@ training:
     lr_decay: 0.83
     weight_decay: 1e-06
     global_grad_clip: 5.0
+    log_interval: 100

 decoding:
     batch_size: 128
...
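The duration-only bounds are replaced by separate limits on input length (seconds), output length (tokens), and the output/input ratio. A hedged sketch of the filtering these keys imply (the helper name is an assumption, not from the repo):

    def keep_utterance(duration_s, n_tokens, conf):
        """Return True if an utterance passes the manifest filters."""
        ratio = n_tokens / duration_s
        return (conf['min_input_len'] <= duration_s <= conf['max_input_len']
                and conf['min_output_len'] <= n_tokens <= conf['max_output_len']
                and conf['min_output_input_ratio'] <= ratio
                <= conf['max_output_input_ratio'])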
#! /usr/bin/env bash

if [ $# != 2 ];then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
\ No newline at end of file
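utils/avg_model.py itself is not shown in this diff; as a rough illustration of what checkpoint averaging does, the parameters of the selected checkpoints are summed and divided element-wise — a sketch assuming Paddle-format .pdparams files:

    import paddle

    def average_checkpoints(ckpt_paths, dst_path):
        """Element-wise average of several .pdparams state dicts."""
        avg = None
        for path in ckpt_paths:
            state = paddle.load(path)
            if avg is None:
                avg = {k: v.astype('float64') for k, v in state.items()}
            else:
                for k in avg:
                    avg[k] += state[k].astype('float64')
        n = len(ckpt_paths)
        avg = {k: (v / n).astype('float32') for k, v in avg.items()}
        paddle.save(avg, dst_path)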
File mode changed from 100644 to 100755
@@ -43,17 +43,17 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
+    num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
-    --feat_dim=80 \
+    --specgram_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --window_ms=20.0 \
     --sample_rate=16000 \
     --use_dB_normalization=False \
     --num_samples=-1 \
-    --num_workers=16 \
+    --num_workers=${num_workers} \
     --output_path="data/mean_std.json"

     if [ $? -ne 0 ]; then
...
 #! /usr/bin/env bash

-if [ $# != 2 ];then
-    echo "usage: export ckpt_path jit_model_path"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi

+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_path_prefix=$2
+jit_model_export_path=$3
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
 python3 -u ${BIN_DIR}/export.py \
---config conf/deepspeech2.yaml \
---checkpoint_path ${1} \
---export_path ${2}
+--device ${device} \
+--nproc ${ngpu} \
+--config ${config_path} \
+--checkpoint_path ${ckpt_path_prefix} \
+--export_path ${jit_model_export_path}

 if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
+    echo "Failed in export!"
     exit 1
 fi
...
File mode changed from 100644 to 100755
 #! /usr/bin/env bash

-if [[ $# != 1 ]]; then
-    echo "usage: $0 ckpt-path"
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
     exit -1
 fi

+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+config_path=$1
+ckpt_prefix=$2
+
 # download language model
 bash local/download_lm_ch.sh
 if [ $? -ne 0 ]; then
     exit 1
 fi

-python3 -u ${BIN_DIR}/infer.py \
---device 'gpu' \
+python3 -u ${BIN_DIR}/test.py \
+--device ${device} \
 --nproc 1 \
---config conf/deepspeech2.yaml \
---checkpoint_path ${1}
+--config ${config_path} \
+--result_file ${ckpt_prefix}.rsl \
+--checkpoint_path ${ckpt_prefix}

 if [ $? -ne 0 ]; then
-    echo "Failed in inference!"
+    echo "Failed in evaluation!"
     exit 1
 fi
...
 #! /usr/bin/env bash

-# train model
-# if you wish to resume from an existing model, uncomment --init_from_pretrained_model
-#export FLAGS_sync_nccl_allreduce=0
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+    exit -1
+fi

-ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

+config_path=$1
+ckpt_name=$2
+
+device=gpu
+if [ ${ngpu} == 0 ];then
+    device=cpu
+fi
+
+mkdir -p exp
+
 python3 -u ${BIN_DIR}/train.py \
---device 'gpu' \
+--device ${device} \
 --nproc ${ngpu} \
---config conf/deepspeech2.yaml \
---output ckpt-${1}
+--config ${config_path} \
+--output exp/${ckpt_name}

 if [ $? -ne 0 ]; then
     echo "Failed in training!"
     exit 1
 fi

 exit 0
File mode changed from 100644 to 100755
 #!/bin/bash
+set -e
 source path.sh

-# only demos
+gpus=0
+stage=0
+stop_stage=100
+conf_path=conf/deepspeech2.yaml
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+avg_num=1
+avg_ckpt=avg_${avg_num}

-# prepare data
-bash ./local/data.sh
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

-# train model
-CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh baseline
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh || exit -1
+fi

-# test model
-CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+fi

-# infer model
-CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num}
+fi

-# export model
-bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
../../s0/local/data.sh
\ No newline at end of file
#! /usr/bin/env bash

stage=-1
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh

mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/aishell/aishell.py \
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/aishell"

    if [ $? -ne 0 ]; then
        echo "Prepare Aishell failed. Terminated."
        exit 1
    fi

    for dataset in train dev test; do
        mv data/manifest.${dataset} data/manifest.${dataset}.raw
    done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
    --unit_type="char" \
    --count_threshold=0 \
    --vocab_path="data/vocab.txt" \
    --manifest_paths "data/manifest.train.raw"

    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # compute mean and stddev for normalizer
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --specgram_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --sample_rate=16000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"

    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    for dataset in train dev test; do
        python3 ${MAIN_ROOT}/utils/format_data.py \
        --feat_type "raw" \
        --cmvn_path "data/mean_std.json" \
        --unit_type "char" \
        --vocab_path="data/vocab.txt" \
        --manifest_path="data/manifest.${dataset}.raw" \
        --output_path="data/manifest.${dataset}"
    done

    if [ $? -ne 0 ]; then
        echo "Format manifest failed. Terminated."
        exit 1
    fi
fi

echo "Aishell data preparation done."
exit 0
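compute_mean_std.py writes per-dimension CMVN statistics over the training features to mean_std.json. A hedged sketch of what that computation looks like conceptually — the field names and shapes are illustrative assumptions, not the repo's exact schema:

    import json
    import numpy as np

    # pretend features: 8 utterances of shape (feat_dim, n_frames)
    feats = [np.random.randn(80, np.random.randint(50, 200)) for _ in range(8)]
    stacked = np.concatenate(feats, axis=1)  # (feat_dim, total_frames)

    stats = {
        "mean": stacked.mean(axis=1).tolist(),
        "istd": (1.0 / (stacked.std(axis=1) + 1e-20)).tolist(),
        "frame_num": int(stacked.shape[1]),
    }
    with open("data/mean_std.json", "w") as f:
        json.dump(stats, f)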
@@ -24,6 +24,7 @@
 import distutils.util
 import io
 import json
 import os
+from multiprocessing.pool import Pool

 import soundfile
@@ -122,42 +123,36 @@ def main():
     if args.target_dir.startswith('~'):
         args.target_dir = os.path.expanduser(args.target_dir)

-    prepare_dataset(
-        url=URL_TEST_CLEAN,
-        md5sum=MD5_TEST_CLEAN,
-        target_dir=os.path.join(args.target_dir, "test-clean"),
-        manifest_path=args.manifest_prefix + ".test-clean")
-    prepare_dataset(
-        url=URL_DEV_CLEAN,
-        md5sum=MD5_DEV_CLEAN,
-        target_dir=os.path.join(args.target_dir, "dev-clean"),
-        manifest_path=args.manifest_prefix + ".dev-clean")
+    tasks = [
+        (URL_TEST_CLEAN, MD5_TEST_CLEAN,
+         os.path.join(args.target_dir, "test-clean"),
+         args.manifest_prefix + ".test-clean"),
+        (URL_DEV_CLEAN, MD5_DEV_CLEAN,
+         os.path.join(args.target_dir, "dev-clean"),
+         args.manifest_prefix + ".dev-clean"),
+    ]
     if args.full_download:
-        prepare_dataset(
-            url=URL_TRAIN_CLEAN_100,
-            md5sum=MD5_TRAIN_CLEAN_100,
-            target_dir=os.path.join(args.target_dir, "train-clean-100"),
-            manifest_path=args.manifest_prefix + ".train-clean-100")
-        prepare_dataset(
-            url=URL_TEST_OTHER,
-            md5sum=MD5_TEST_OTHER,
-            target_dir=os.path.join(args.target_dir, "test-other"),
-            manifest_path=args.manifest_prefix + ".test-other")
-        prepare_dataset(
-            url=URL_DEV_OTHER,
-            md5sum=MD5_DEV_OTHER,
-            target_dir=os.path.join(args.target_dir, "dev-other"),
-            manifest_path=args.manifest_prefix + ".dev-other")
-        prepare_dataset(
-            url=URL_TRAIN_CLEAN_360,
-            md5sum=MD5_TRAIN_CLEAN_360,
-            target_dir=os.path.join(args.target_dir, "train-clean-360"),
-            manifest_path=args.manifest_prefix + ".train-clean-360")
-        prepare_dataset(
-            url=URL_TRAIN_OTHER_500,
-            md5sum=MD5_TRAIN_OTHER_500,
-            target_dir=os.path.join(args.target_dir, "train-other-500"),
-            manifest_path=args.manifest_prefix + ".train-other-500")
+        tasks.extend([
+            (URL_TRAIN_CLEAN_100, MD5_TRAIN_CLEAN_100,
+             os.path.join(args.target_dir, "train-clean-100"),
+             args.manifest_prefix + ".train-clean-100"),
+            (URL_TEST_OTHER, MD5_TEST_OTHER,
+             os.path.join(args.target_dir, "test-other"),
+             args.manifest_prefix + ".test-other"),
+            (URL_DEV_OTHER, MD5_DEV_OTHER,
+             os.path.join(args.target_dir, "dev-other"),
+             args.manifest_prefix + ".dev-other"),
+            (URL_TRAIN_CLEAN_360, MD5_TRAIN_CLEAN_360,
+             os.path.join(args.target_dir, "train-clean-360"),
+             args.manifest_prefix + ".train-clean-360"),
+            (URL_TRAIN_OTHER_500, MD5_TRAIN_OTHER_500,
+             os.path.join(args.target_dir, "train-other-500"),
+             args.manifest_prefix + ".train-other-500"),
+        ])
+
+    with Pool(7) as pool:
+        pool.starmap(prepare_dataset, tasks)
+
+    print("Data download and manifest prepare done!")

 if __name__ == '__main__':
...
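Pool.starmap unpacks each tuple positionally, so the tuple order must match prepare_dataset's parameter order (url, md5sum, target_dir, manifest_path), and the seven subsets can then download concurrently. A minimal sketch of the pattern with a hypothetical stand-in worker:

    from multiprocessing.pool import Pool

    def download_and_index(url, md5sum, target_dir, manifest_path):
        # stand-in for prepare_dataset; just reports its arguments
        print(f"fetch {url} ({md5sum}) -> {target_dir}, manifest {manifest_path}")

    tasks = [
        ("http://example.com/a.tgz", "md5a", "/data/a", "manifest.a"),
        ("http://example.com/b.tgz", "md5b", "/data/b", "manifest.b"),
    ]

    if __name__ == '__main__':
        with Pool(2) as pool:
            pool.starmap(download_and_index, tasks)  # one tuple per call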
@@ -23,6 +23,7 @@
 import codecs
 import io
 import json
 import os
+from multiprocessing.pool import Pool

 import soundfile
@@ -103,16 +104,18 @@ def main():
     if args.target_dir.startswith('~'):
         args.target_dir = os.path.expanduser(args.target_dir)

-    prepare_dataset(
-        url=URL_TRAIN_CLEAN,
-        md5sum=MD5_TRAIN_CLEAN,
-        target_dir=os.path.join(args.target_dir, "train-clean"),
-        manifest_path=args.manifest_prefix + ".train-clean")
-    prepare_dataset(
-        url=URL_DEV_CLEAN,
-        md5sum=MD5_DEV_CLEAN,
-        target_dir=os.path.join(args.target_dir, "dev-clean"),
-        manifest_path=args.manifest_prefix + ".dev-clean")
+    tasks = [
+        (URL_TRAIN_CLEAN, MD5_TRAIN_CLEAN,
+         os.path.join(args.target_dir, "train-clean"),
+         args.manifest_prefix + ".train-clean"),
+        (URL_DEV_CLEAN, MD5_DEV_CLEAN,
+         os.path.join(args.target_dir, "dev-clean"),
+         args.manifest_prefix + ".dev-clean"),
+    ]
+
+    with Pool(2) as pool:
+        pool.starmap(prepare_dataset, tasks)
+
+    print("Data download and manifest prepare done!")

 if __name__ == '__main__':
...
@@ -7,14 +7,20 @@ data:
     vocab_filepath: data/vocab.txt
     augmentation_config: conf/augmentation.json
     batch_size: 20
-    max_duration: 27.0
-    min_duration: 0.0
+    min_input_len: 0.0
+    max_input_len: 27.0 # second
+    min_output_len: 0.0
+    max_output_len: 400.0
+    min_output_input_ratio: 0.05
+    max_output_input_ratio: 10.0
     specgram_type: linear
     target_sample_rate: 16000
     max_freq: None
     n_fft: None
     stride_ms: 10.0
     window_ms: 20.0
+    delta_delta: False
+    dither: 1.0
     use_dB_normalization: True
     target_dB: -20
     random_seed: 0
@@ -22,18 +28,22 @@ data:
     sortagrad: True
     shuffle_method: batch_shuffle
     num_workers: 0

 model:
     num_conv_layers: 2
     num_rnn_layers: 3
     rnn_layer_size: 2048
     use_gru: False
     share_rnn_weights: True

 training:
     n_epoch: 50
     lr: 1e-3
     lr_decay: 0.83
     weight_decay: 1e-06
     global_grad_clip: 5.0
+    log_interval: 100

 decoding:
     batch_size: 128
     error_rate_type: wer
...
#! /usr/bin/env bash

if [ $# != 2 ];then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
\ No newline at end of file
 #! /usr/bin/env bash

+stage=-1
+stop_stage=100
+unit_type=char
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
 mkdir -p data
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}

-# download data, generate manifests
-PYTHONPATH=.:$PYTHONPATH python3 ${TARGET_DIR}/librispeech/librispeech.py \
---manifest_prefix="data/manifest" \
---target_dir="${TARGET_DIR}/librispeech" \
---full_download="True"
-
-if [ $? -ne 0 ]; then
-    echo "Prepare LibriSpeech failed. Terminated."
-    exit 1
-fi
-
-cat data/manifest.train-* | shuf > data/manifest.train
-
-# build vocabulary
-python3 ${MAIN_ROOT}/utils/build_vocab.py \
---count_threshold=0 \
---vocab_path="data/vocab.txt" \
---manifest_paths="data/manifest.train"
-
-if [ $? -ne 0 ]; then
-    echo "Build vocabulary failed. Terminated."
-    exit 1
-fi
-
-# compute mean and stddev for normalizer
-python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
---manifest_path="data/manifest.train" \
---num_samples=2000 \
---specgram_type="linear" \
---output_path="data/mean_std.npz"
-
-if [ $? -ne 0 ]; then
-    echo "Compute mean and stddev failed. Terminated."
-    exit 1
-fi
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # download data, generate manifests
+    python3 ${TARGET_DIR}/librispeech/librispeech.py \
+    --manifest_prefix="data/manifest" \
+    --target_dir="${TARGET_DIR}/librispeech" \
+    --full_download="True"
+
+    if [ $? -ne 0 ]; then
+        echo "Prepare LibriSpeech failed. Terminated."
+        exit 1
+    fi
+
+    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+        mv data/manifest.${set} data/manifest.${set}.raw
+    done
+
+    for set in train-clean-100 train-clean-360 train-other-500; do
+        cat data/manifest.${set}.raw >> data/manifest.train.raw
+    done
+
+    for set in dev-clean dev-other; do
+        cat data/manifest.${set}.raw >> data/manifest.dev.raw
+    done
+
+    for set in test-clean test-other; do
+        cat data/manifest.${set}.raw >> data/manifest.test.raw
+    done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type ${unit_type} \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # compute mean and stddev for normalizer
+    num_workers=$(nproc)
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.train.raw" \
+    --num_samples=-1 \
+    --specgram_type="linear" \
+    --delta_delta=false \
+    --sample_rate=16000 \
+    --stride_ms=10.0 \
+    --window_ms=20.0 \
+    --use_dB_normalization=False \
+    --num_workers=${num_workers} \
+    --output_path="data/mean_std.json"
+
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for set in train dev test dev-clean dev-other test-clean test-other; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+        --feat_type "raw" \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type ${unit_type} \
+        --vocab_path="data/vocab.txt" \
+        --manifest_path="data/manifest.${set}.raw" \
+        --output_path="data/manifest.${set}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest.${set} failed. Terminated."
+            exit 1
+        fi
+    } &
+    done
+    wait
+fi

 echo "LibriSpeech Data preparation done."
...
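build_vocab.py with unit_type char collects the character inventory of the training manifests. A hedged sketch of the idea — the manifest field name and output format are assumptions for illustration, not the repo's exact schema:

    import json
    from collections import Counter

    counter = Counter()
    with open("data/manifest.train.raw") as f:
        for line in f:
            sample = json.loads(line)
            counter.update(sample["text"])  # count every character

    count_threshold = 0
    with open("data/vocab.txt", "w") as f:
        for char, count in sorted(counter.items()):
            if count > count_threshold:
                f.write(char + "\n")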
File mode changed from 100644 to 100755 (6 files)
@@ -3,18 +3,24 @@ data:
     train_manifest: data/manifest.tiny
     dev_manifest: data/manifest.tiny
     test_manifest: data/manifest.tiny
-    mean_std_filepath: data/mean_std.npz
+    mean_std_filepath: data/mean_std.json
     vocab_filepath: data/vocab.txt
     augmentation_config: conf/augmentation.json
     batch_size: 4
-    max_duration: 27.0
-    min_duration: 0.0
+    min_input_len: 0.0
+    max_input_len: 27.0
+    min_output_len: 0.0
+    max_output_len: 400.0
+    min_output_input_ratio: 0.05
+    max_output_input_ratio: 10.0
     specgram_type: linear
     target_sample_rate: 16000
     max_freq: None
     n_fft: None
     stride_ms: 10.0
     window_ms: 20.0
+    delta_delta: False
+    dither: 1.0
     use_dB_normalization: True
     target_dB: -20
     random_seed: 0
@@ -22,18 +28,22 @@ data:
     sortagrad: True
     shuffle_method: batch_shuffle
     num_workers: 0

 model:
     num_conv_layers: 2
     num_rnn_layers: 3
     rnn_layer_size: 2048
     use_gru: False
     share_rnn_weights: True

 training:
     n_epoch: 20
     lr: 1e-5
     lr_decay: 1.0
     weight_decay: 1e-06
     global_grad_clip: 5.0
+    log_interval: 1

 decoding:
     batch_size: 128
     error_rate_type: wer
...
#! /usr/bin/env bash

if [ $# != 2 ];then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${MAIN_ROOT}/utils/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
\ No newline at end of file
@@ -3,10 +3,7 @@
 stage=-1
 stop_stage=100

-# bpemode (unigram or bpe)
-nbpe=200
-bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+unit_type=char

 source ${MAIN_ROOT}/utils/parse_options.sh
@@ -32,10 +29,8 @@ fi
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # build vocabulary
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
+    --unit_type ${unit_type} \
+    --count_threshold=0 \
     --vocab_path="data/vocab.txt" \
     --manifest_paths="data/manifest.tiny.raw"
@@ -51,12 +46,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
-    --feat_dim=80 \
+    --specgram_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --window_ms=20.0 \
     --use_dB_normalization=False \
     --num_workers=2 \
     --output_path="data/mean_std.json"
@@ -73,8 +67,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --feat_type "raw" \
     --cmvn_path "data/mean_std.json" \
-    --unit_type "spm" \
-    --spm_model_prefix ${bpeprefix} \
+    --unit_type ${unit_type} \
     --vocab_path="data/vocab.txt" \
     --manifest_path="data/manifest.tiny.raw" \
     --output_path="data/manifest.tiny"
...
File mode changed from 100644 to 100755 (4 files)
@@ -2,6 +2,7 @@
 set -e
 source path.sh

+gpus=0
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
@@ -18,7 +19,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0 ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -28,10 +29,10 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
...
../../s0/local/data.sh
\ No newline at end of file
#! /usr/bin/env bash

stage=-1
stop_stage=100

# bpemode (unigram or bpe)
nbpe=200
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"

source ${MAIN_ROOT}/utils/parse_options.sh

mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/librispeech/librispeech.py \
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/librispeech" \
    --full_download="False"

    if [ $? -ne 0 ]; then
        echo "Prepare LibriSpeech failed. Terminated."
        exit 1
    fi

    head -n 64 data/manifest.dev-clean > data/manifest.tiny.raw
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
    --unit_type "spm" \
    --spm_vocab_size=${nbpe} \
    --spm_mode ${bpemode} \
    --spm_model_prefix ${bpeprefix} \
    --vocab_path="data/vocab.txt" \
    --manifest_paths="data/manifest.tiny.raw"

    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # compute mean and stddev for normalizer
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.tiny.raw" \
    --num_samples=64 \
    --specgram_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --use_dB_normalization=False \
    --num_workers=2 \
    --output_path="data/mean_std.json"

    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    python3 ${MAIN_ROOT}/utils/format_data.py \
    --feat_type "raw" \
    --cmvn_path "data/mean_std.json" \
    --unit_type "spm" \
    --spm_model_prefix ${bpeprefix} \
    --vocab_path="data/vocab.txt" \
    --manifest_path="data/manifest.tiny.raw" \
    --output_path="data/manifest.tiny"

    if [ $? -ne 0 ]; then
        echo "Format manifest failed. Terminated."
        exit 1
    fi
fi

echo "LibriSpeech Data preparation done."
exit 0